Merge pull request #132 from Lukas0907/fixes
Fixes related to caching, LWN.net and various improvements
Lukas0907 committed Jul 23, 2018
2 parents 89bfd1f + b1662c7 commit aa368e9
Showing 34 changed files with 376 additions and 296 deletions.
2 changes: 0 additions & 2 deletions docs/generate_spider_documentation_template.py
@@ -36,5 +36,3 @@ def main(spider_name):

if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 smartindent autoindent
12 changes: 9 additions & 3 deletions docs/spiders/falter.at.rst
@@ -2,7 +2,7 @@

falter.at
---------
Get newest articles and restaurant reviews ("wwei") from Falter_.

Configuration
~~~~~~~~~~~~~
@@ -14,13 +14,19 @@ Add ``falter.at`` to the list of spiders:
spiders =
    falter.at

Falter_ has a paywall for certain articles. If you want to crawl paid articles, please
provide ``abonr`` (subscription number) and ``password``.

``pages`` accepts ``magazine`` for the Falter newspaper and ``wwei`` for the restaurant
reviews. By default both are scraped.

.. code-block:: ini

   [falter.at]
   abonr =
   password =
   pages =
     magazine
     wwei

.. _Falter: https://www.falter.at
10 changes: 7 additions & 3 deletions docs/spiders/lwn.net.rst
@@ -2,7 +2,11 @@

lwn.net
-------
Newest articles from LWN_ with special treatment of LWN_ Weekly Editions. Please note
that LWN_ requires the cache to be enabled to minimize useless requests. If you provide
a username and password, the session (cookie) is also cached until the cache entry
expires. The session cookie is valid for a month, so to avoid disruptions, set the
cache expiry time to less than that.
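
For illustration, a ``feeds.cfg`` along these lines should satisfy this (a sketch only;
``cache_enabled`` and ``cache_expires`` are the keys used in ``feeds.cfg.dist`` and
``feeds/cli.py`` elsewhere in this commit, and 14 days is the default expiry there):

.. code-block:: ini

   [feeds]
   spiders =
     lwn.net
   ## Caching must be enabled for lwn.net.
   cache_enabled = 1
   ## Expire entries (and the cached session cookie) well before the one-month
   ## session cookie lifetime.
   cache_expires = 14

   [lwn.net]
   username =
   password =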

Configuration
~~~~~~~~~~~~~
@@ -14,8 +18,8 @@ Add ``lwn.net`` to the list of spiders:
spiders =
    lwn.net

LWN_ has paywalled articles. If you want to crawl them, please provide ``username`` and
``password``.

.. code-block:: ini
6 changes: 6 additions & 0 deletions feeds.cfg.dist
@@ -17,6 +17,12 @@ useragent = feeds (+https://github.com/nblock/feeds)
## See also: https://validator.w3.org/feed/docs/warning/MissingSelf.html
# output_url = https://example.com/feeds

## Truncate content to 10 words instead of including the full text.
## This can be useful if generated feeds should be made publicly available.
# truncate_words = 10
## Remove images from output.
# remove_images = 1

## Enable caching of responses
# cache_enabled = 0
## Path to the cache.
33 changes: 15 additions & 18 deletions feeds/cache.py
@@ -7,7 +7,7 @@
logger = logging.getLogger(__name__)


IGNORE_HTTP_CODES = [404, 500, 502, 503, 504]
IGNORE_HTTP_CODES = [403, 404, 500, 502, 503, 504]


def read_meta(root):
@@ -23,22 +23,11 @@ def cleanup_cache(cache_dir, max_age):
    for cache_entry_path, _dirs, files in os.walk(cache_dir, topdown=False):
        if "pickled_meta" in files:
            meta = read_meta(cache_entry_path)

            timestamp = datetime.fromtimestamp(meta["timestamp"])
            if timestamp < max_age:
                remove_cache_entry(cache_entry_path, meta["response_url"])
                remove_cache_entry(cache_entry_path)
            elif meta["status"] in IGNORE_HTTP_CODES:
                remove_cache_entry(cache_entry_path, meta["response_url"])
                logger.debug(
                    "Removing parent cache entries for URL {}".format(
                        meta["response_url"]
                    )
                )
                spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
                # Remove parents as well.
                for fingerprint in meta["parents"]:
                    path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
                    remove_cache_entry(path, read_meta(path)["response_url"])
                remove_cache_entry(cache_entry_path, remove_parents=True)
        elif not os.path.samefile(cache_entry_path, cache_dir):
            # Try to delete parent directory of cache entries.
            try:
@@ -50,10 +39,18 @@ def cleanup_cache(cache_dir, max_age):
logger.debug("Finished cleaning cache entries.")


def remove_cache_entry(cache_entry_path, url):
def remove_cache_entry(cache_entry_path, remove_parents=False):
    if os.path.exists(cache_entry_path):
        logger.debug("Removing cache entry for URL {}".format(url))
        meta = read_meta(cache_entry_path)
        if remove_parents:
            logger.debug(
                "Removing parent cache entries for URL {}".format(meta["response_url"])
            )
            spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
            for fingerprint in meta["parents"]:
                path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
                remove_cache_entry(path)
logger.debug("Removing cache entry for URL {}".format(meta["response_url"]))
shutil.rmtree(cache_entry_path)
else:
logger.error("Cannot remove cache entry {} for URL {}".format(
cache_entry_path, url))
logger.debug("Cannot remove cache entry {}".format(cache_entry_path))
42 changes: 5 additions & 37 deletions feeds/cli.py
@@ -1,4 +1,3 @@
import configparser
import logging
import os
from datetime import datetime, timedelta
@@ -10,49 +9,18 @@
from twisted.python import failure

from feeds.cache import cleanup_cache
from feeds.settings import load_feeds_settings

logger = logging.getLogger(__name__)

FEEDS_CFGFILE_MAPPING = {
    "USER_AGENT": "useragent",
    "LOG_LEVEL": "loglevel",
    "HTTPCACHE_ENABLED": "cache_enabled",
    "HTTPCACHE_DIR": "cache_dir",
}


def run_cleanup_cache(settings):
    days = int(
        settings.get("FEEDS_CONFIG", {}).get("feeds", {}).get("cache_expires", 14)
    )
    days = settings.getint("FEEDS_CONFIG_CACHE_EXPIRES")
    cleanup_cache(
        data_path(settings["HTTPCACHE_DIR"]), datetime.now() - timedelta(days=days)
        data_path(settings.get("HTTPCACHE_DIR")), datetime.now() - timedelta(days=days)
    )


def get_feeds_settings(file_=None):
    if file_:
        logger.debug("Parsing configuration file {} ...".format(file_.name))
        # Parse configuration file and store result under FEEDS_CONFIG of
        # scrapy's settings API.
        parser = configparser.ConfigParser()
        parser.read_file(file_)
        config = {s: dict(parser.items(s)) for s in parser.sections()}
    else:
        config = {}

    settings = get_project_settings()
    settings.set("FEEDS_CONFIG", config)

    # Mapping of feeds config section to setting names.
    for settings_key, config_key in FEEDS_CFGFILE_MAPPING.items():
        config_value = config.get("feeds", {}).get(config_key)
        if config_value:
            settings.set(settings_key, config_value)

    return settings


def spiders_to_crawl(process, argument_spiders):
    if argument_spiders:
        # Spider(s) given as command line argument(s).
@@ -61,7 +29,7 @@ def spiders_to_crawl(process, argument_spiders):

    try:
        # Spider(s) given in configuration file.
        spiders = process.settings.get("FEEDS_CONFIG")["feeds"]["spiders"]
        spiders = process.settings.get("FEEDS_CONFIG_SPIDERS")
        logger.debug("Using configuration file to decide what spiders to run.")
        return spiders.split()
    except KeyError:
@@ -90,7 +58,7 @@ def cli(ctx, loglevel, config, pdb):
failure.startDebugMode()
os.chdir(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))

settings = get_feeds_settings(config)
settings = load_feeds_settings(config)
settings.set("LOG_LEVEL", loglevel.upper())
ctx.obj["settings"] = settings

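
A note on the new feeds/settings.py module: it is among the 34 changed files but is not
shown in this excerpt. The following is a hypothetical sketch only, reconstructed from
the removed get_feeds_settings() above and from the FEEDS_CONFIG_SPIDERS and
FEEDS_CONFIG_CACHE_EXPIRES keys that cli.py now reads; the actual names, defaults and
structure in the commit may differ.

    # Hypothetical sketch of feeds/settings.py -- not part of this diff.
    import configparser
    import logging

    from scrapy.utils.project import get_project_settings

    logger = logging.getLogger(__name__)

    # Mapping of scrapy setting names to keys in the [feeds] section, as in the
    # FEEDS_CFGFILE_MAPPING removed from cli.py.
    FEEDS_CFGFILE_MAPPING = {
        "USER_AGENT": "useragent",
        "LOG_LEVEL": "loglevel",
        "HTTPCACHE_ENABLED": "cache_enabled",
        "HTTPCACHE_DIR": "cache_dir",
    }


    def load_feeds_settings(file_=None):
        """Load the scrapy project settings and merge in an optional feeds.cfg."""
        settings = get_project_settings()
        if not file_:
            return settings

        logger.debug("Parsing configuration file {} ...".format(file_.name))
        parser = configparser.ConfigParser()
        parser.read_file(file_)
        config = {s: dict(parser.items(s)) for s in parser.sections()}
        settings.set("FEEDS_CONFIG", config)

        feeds = config.get("feeds", {})
        # Flattened keys consumed by cli.py in this commit (assumed here).
        settings.set("FEEDS_CONFIG_SPIDERS", feeds.get("spiders", ""))
        settings.set("FEEDS_CONFIG_CACHE_EXPIRES", int(feeds.get("cache_expires", 14)))

        for settings_key, config_key in FEEDS_CFGFILE_MAPPING.items():
            config_value = feeds.get(config_key)
            if config_value:
                settings.set(settings_key, config_value)

        return settings

With a module along these lines, run_cleanup_cache() reads the expiry via
settings.getint("FEEDS_CONFIG_CACHE_EXPIRES") and spiders_to_crawl() reads
FEEDS_CONFIG_SPIDERS, as shown in the cli.py hunks above.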
54 changes: 54 additions & 0 deletions feeds/default_settings.py
@@ -0,0 +1,54 @@
import logging

# Feeds configuration populated by an optional feeds configuration file.
FEEDS_CONFIG = {}

# Low level settings intended for scrapy.
# Please use feeds.cfg to configure feeds.

BOT_NAME = "feeds"
SPIDER_MODULES = ["feeds.spiders"]
NEWSPIDER_MODULE = "feeds.spiders"

# Don't overwhelm sites with requests.
CONCURRENT_REQUESTS_PER_DOMAIN = 2
DOWNLOAD_DELAY = 0.25

# Disable telnet
TELNETCONSOLE_ENABLED = False

# Custom item pipeline
ITEM_PIPELINES = {
    "feeds.pipelines.AtomAutogenerateFieldsPipeline": 100,
    "feeds.pipelines.AtomCheckRequiredFieldsPipeline": 110,
    "feeds.pipelines.AtomExportPipeline": 400,
}

SPIDER_MIDDLEWARES = {
    "feeds.spidermiddlewares.FeedsHttpErrorMiddleware": 51,
    "feeds.spidermiddlewares.FeedsHttpCacheMiddleware": 1000,
}

DOWNLOADER_MIDDLEWARES = {
    "feeds.downloadermiddlewares.FeedsHttpCacheMiddleware": 900,
    "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": None,
}

HTTPCACHE_ENABLED = False
HTTPCACHE_STORAGE = "feeds.extensions.FeedsCacheStorage"
HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
HTTPCACHE_DIR = "cache"
# We cache everything and delete cache entries (and every parent request) during
# cleanup.
HTTPCACHE_IGNORE_HTTP_CODES = []

# Default user agent. Can be overridden in feeds.cfg.
USER_AGENT = "feeds (+https://github.com/nblock/feeds)"

# Set default level to info.
# Can be overridden with the --loglevel parameter.
LOG_LEVEL = logging.INFO

# Stats collection is disabled by default.
# Can be overridden with the --stats parameter.
STATS_CLASS = "scrapy.statscollectors.DummyStatsCollector"
13 changes: 13 additions & 0 deletions feeds/downloadermiddlewares.py
@@ -0,0 +1,13 @@
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy import signals


class FeedsHttpCacheMiddleware(HttpCacheMiddleware):
    @classmethod
    def from_crawler(cls, crawler):
        o = super().from_crawler(crawler)
        crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
        return o

    def item_dropped(self, item, response, exception, spider):
        self.storage.item_dropped(item, response, exception, spider)
3 changes: 0 additions & 3 deletions feeds/exporters.py
@@ -198,6 +198,3 @@ def export_item(self, item):
link_self = None
self._feeds[path] = self.AtomFeed(exporter=self, link_self=link_self)
self._feeds[path].add_item(item)


# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 smartindent autoindent
33 changes: 14 additions & 19 deletions feeds/extensions.py
@@ -1,26 +1,11 @@
import os

import pickle
from scrapy import signals
from scrapy.extensions.httpcache import FilesystemCacheStorage
from scrapy.utils.request import request_fingerprint
from scrapy.utils.python import to_bytes

from feeds.cache import IGNORE_HTTP_CODES


class SpiderSettings:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        return ext

    def spider_opened(self, spider):
        spider.spider_settings = self.spider_settings(spider)

    @classmethod
    def spider_settings(cls, spider):
        return spider.settings.get("FEEDS_CONFIG").get(spider.name, {})
from feeds.cache import IGNORE_HTTP_CODES, remove_cache_entry


class FeedsCacheStorage(FilesystemCacheStorage):
@@ -47,9 +32,10 @@ def store_response(self, spider, request, response):
        # Read the new metadata.
        metadata = self._read_meta(spider, request)
        # Add the parents' fingerprints to the metadata and merge the parents from the
        # old metadata.
        # old metadata. The last fingerprint is not included since it's the fingerprint
        # of this request.
        metadata["parents"] = list(
            set(request.meta["fingerprints"]).union(
            set(request.meta["fingerprints"][:-1]).union(
                old_metadata["parents"] if old_metadata else []
            )
        )
@@ -59,3 +45,12 @@ def store_response(self, spider, request, response):
            f.write(to_bytes(repr(metadata)))
        with self._open(os.path.join(rpath, "pickled_meta"), "wb") as f:
            pickle.dump(metadata, f, protocol=2)

    def _get_request_path(self, spider, request):
        key = request_fingerprint(request, include_headers=["Cookie"])
        return os.path.join(self.cachedir, spider.name, key[0:2], key)

    def item_dropped(self, item, response, exception, spider):
        remove_cache_entry(
            self._get_request_path(spider, response.request), remove_parents=True
        )
3 changes: 0 additions & 3 deletions feeds/items.py
@@ -59,6 +59,3 @@ class FeedEntryItem(BaseItem):
    enclosure_iri = scrapy.Field()
    # Optional
    enclosure_type = scrapy.Field()


# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 smartindent autoindent
