
Merge pull request #190 from Lukas0907/next

Fix cache problems; spider enhancements

Lukas0907 committed Jan 30, 2019
2 parents ec2f493 + 50823f4 commit c9069ce
Showing 9 changed files with 20 additions and 10 deletions.
6 changes: 1 addition & 5 deletions feeds/cache.py
@@ -183,16 +183,12 @@ def remove_cache_entry(self, cache_entry_path, remove_parents=False):
         if meta is None:
             return
 
-        if remove_parents:
-            logger.debug(
-                "Removing parent cache entries for URL {}".format(meta["response_url"])
-            )
+        if remove_parents and "parents" in meta:
             spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
             for fingerprint in meta["parents"]:
                 path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
                 self.remove_cache_entry(path, remove_parents=False)
 
-        logger.debug("Removing cache entry for URL {}".format(meta["response_url"]))
         shutil.rmtree(cache_entry_path, ignore_errors=True)


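The added "parents" in meta guard matters because cache entries written before parent tracking existed have no "parents" key, so meta["parents"] would raise a KeyError during cleanup. A minimal standalone sketch of the guard; the meta dicts below are made-up stand-ins for the metadata Feeds stores next to each cache entry:

    # Hypothetical metadata dicts; an entry written by an older version
    # of Feeds may lack the "parents" key entirely.
    old_meta = {"response_url": "https://example.com/feed"}
    new_meta = {"response_url": "https://example.com/feed", "parents": ["ab12cd"]}

    for meta in (old_meta, new_meta):
        # Without the membership test, meta["parents"] raises KeyError
        # for pre-existing cache entries.
        if "parents" in meta:
            for fingerprint in meta["parents"]:
                print("would remove parent entry", fingerprint)
        else:
            print("no parent entries recorded for", meta["response_url"])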
3 changes: 3 additions & 0 deletions feeds/default_settings.py
@@ -46,6 +46,9 @@
 HTTPCACHE_EXPIRATION_SECS = FEEDS_CONFIG_CACHE_EXPIRES * 24 * 60 * 60
 HTTPCACHE_IGNORE_HTTP_CODES = list(range(400, 600))
 
+# Do not enable cookies by default to make better use of the cache.
+COOKIES_ENABLED = False
+
 RETRY_ENABLED = True
 # equals 5 requests in total
 RETRY_TIMES = 4
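With cookies off project-wide, requests stay stateless and cached responses can be replayed without a session; spiders that genuinely need a session opt back in per spider, which is what the remaining files in this commit do. A minimal sketch of the override mechanism, using a hypothetical spider that is not part of this commit:

    import scrapy

    class ExampleSpider(scrapy.Spider):
        # custom_settings takes precedence over the project-wide
        # COOKIES_ENABLED = False from default_settings.py, so only
        # this spider carries cookies.
        name = "example"
        start_urls = ["https://example.com/"]
        custom_settings = {"COOKIES_ENABLED": True}

        def parse(self, response):
            self.logger.info("Got %s with cookies enabled", response.url)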
5 changes: 4 additions & 1 deletion feeds/spiders/biblioweb_at.py
@@ -6,7 +6,10 @@

 class BibliowebAtSpider(FeedsSpider):
     name = "biblioweb.at"
-    custom_settings = {"DUPEFILTER_CLASS": "scrapy.dupefilters.RFPDupeFilter"}
+    custom_settings = {
+        "DUPEFILTER_CLASS": "scrapy.dupefilters.RFPDupeFilter",
+        "COOKIES_ENABLED": True,
+    }
 
     _days = 60

2 changes: 1 addition & 1 deletion feeds/spiders/falter_at.py
@@ -14,7 +14,7 @@
 class FalterAtSpider(FeedsSpider):
     name = "falter.at"
     # Don't overwhelm the poor Wordpress with too many requests at once.
-    custom_settings = {"DOWNLOAD_DELAY": 1.0}
+    custom_settings = {"DOWNLOAD_DELAY": 1.0, "COOKIES_ENABLED": True}
 
     def start_requests(self):
         pages = self.settings.get("FEEDS_SPIDER_FALTER_AT_PAGES")
1 change: 1 addition & 0 deletions feeds/spiders/konsument_at.py
@@ -10,6 +10,7 @@
 class KonsumentAtSpider(FeedsSpider):
     name = "konsument.at"
     start_urls = ["https://www.konsument.at/page/das-aktuelle-heft"]
+    custom_settings = {"COOKIES_ENABLED": True}
 
     feed_title = "KONSUMENT.AT"
     feed_subtitle = "Objektiv, unbestechlich, keine Werbung"
2 changes: 1 addition & 1 deletion feeds/spiders/lwn_net.py
@@ -70,7 +70,7 @@ class LwnNetSpider(FeedsXMLFeedSpider):
     # introduced rss namespace prefix.
     iterator = "xml"
     # lwn.net doesn't like it (i.e. blocks us) if we impose too much load.
-    custom_settings = {"DOWNLOAD_DELAY": 1.0}
+    custom_settings = {"DOWNLOAD_DELAY": 1.0, "COOKIES_ENABLED": True}
 
     _subscribed = False

1 change: 1 addition & 0 deletions feeds/spiders/nachrichten_at.py
@@ -11,6 +11,7 @@

 class NachrichtenAtSpider(FeedsXMLFeedSpider):
     name = "nachrichten.at"
+    custom_settings = {"COOKIES_ENABLED": True}
 
     def start_requests(self):
         self._ressorts = self.settings.get("FEEDS_SPIDER_NACHRICHTEN_AT_RESSORTS")
9 changes: 7 additions & 2 deletions feeds/spiders/tuwien_ac_at.py
@@ -31,9 +31,14 @@ def parse(self, response):

         response = yield scrapy.Request(link, method="HEAD")
         mb_url = response.url
-        mb_id = re.search(
+        match = re.search(
             r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)", mb_url
-        ).group(1)
+        )
+        if not match:
+            self.logger.error("No Mitteilungsblätter found!")
+            return
+        else:
+            mb_id = match.group(1)
 
         url = "https://tiss.{}/api/mbl/v22/id/{}".format(self.name, mb_id)
         response = yield scrapy.Request(url)
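The old code chained .group(1) directly onto re.search(), which returns None when the pattern does not match, so a miss died with an unhelpful AttributeError instead of a log message. A standalone sketch of both behaviours; the non-matching URL is a made-up example:

    import re

    pattern = r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)"
    mb_url = "https://tiss.tuwien.ac.at/error"  # hypothetical non-matching URL

    # Old behaviour: re.search() returns None on a miss, so .group(1)
    # raises AttributeError ('NoneType' object has no attribute 'group').
    try:
        mb_id = re.search(pattern, mb_url).group(1)
    except AttributeError:
        print("old code would crash here")

    # New behaviour: test the match object before using it.
    match = re.search(pattern, mb_url)
    if not match:
        print("No Mitteilungsblätter found!")
    else:
        print("mb_id =", match.group(1))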
1 change: 1 addition & 0 deletions feeds/spiders/uebermedien_de.py
@@ -15,6 +15,7 @@ class UebermedienDeSpider(FeedsXMLFeedSpider):
name = "uebermedien.de"
start_urls = ["https://uebermedien.de/feed/"]
namespaces = [("dc", "http://purl.org/dc/elements/1.1/")]
custom_settings = {"COOKIES_ENABLED": True}

feed_title = "uebermedien.de"
feed_subtitle = "Medien besser kritisieren."
