Merge pull request #171 from Lukas0907/fixes
Improve dietiwag, fix caching problems
Lukas0907 committed Sep 18, 2018
2 parents 5813d43 + 17ade0e commit 9144df0
Showing 9 changed files with 83 additions and 47 deletions.
32 changes: 15 additions & 17 deletions feeds/cache.py
@@ -33,7 +33,6 @@ def cleanup_cache(cache_dir, expires):
for cache_entry_path, _dirs, files in os.walk(cache_dir, topdown=False):
if "pickled_meta" in files:
meta = read_meta(cache_entry_path)
logger.debug("Checking cache entry for URL {}".format(meta["response_url"]))
try:
entry_expires = timedelta(seconds=meta["cache_expires"])
except KeyError:
@@ -43,9 +42,6 @@ def cleanup_cache(cache_dir, expires):
datetime.fromtimestamp(meta["timestamp"], tz=timezone.utc)
+ entry_expires
)
logger.debug(
"Entry expires after {} at {}".format(entry_expires, threshold)
)
if now > threshold:
remove_cache_entry(cache_entry_path)
elif meta["status"] in IGNORE_HTTP_CODES:
@@ -62,17 +58,19 @@ def cleanup_cache(cache_dir, expires):


def remove_cache_entry(cache_entry_path, remove_parents=False):
if os.path.exists(cache_entry_path):
try:
meta = read_meta(cache_entry_path)
if remove_parents:
logger.debug(
"Removing parent cache entries for URL {}".format(meta["response_url"])
)
spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
for fingerprint in meta["parents"]:
path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
remove_cache_entry(path, read_meta(path)["response_url"])
logger.debug("Removing cache entry for URL {}".format(meta["response_url"]))
shutil.rmtree(cache_entry_path)
else:
logger.debug("Cannot remove cache entry {}".format(cache_entry_path))
except FileNotFoundError:
return

if remove_parents:
logger.debug(
"Removing parent cache entries for URL {}".format(meta["response_url"])
)
spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
for fingerprint in meta["parents"]:
path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
remove_cache_entry(path, remove_parents=False)

logger.debug("Removing cache entry for URL {}".format(meta["response_url"]))
shutil.rmtree(cache_entry_path, ignore_errors=True)
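
The new remove_cache_entry() tolerates entries that have already disappeared (FileNotFoundError) and removes parent entries non-recursively via remove_parents=False, where the old call passed a response URL as that positional flag. For orientation, a minimal sketch of a reader equivalent to the read_meta() used above, based on the on-disk layout the path handling in this file assumes (one directory per request fingerprint, containing a pickled_meta file); the real implementation in feeds/cache.py may differ:

import os
import pickle


def read_meta(cache_entry_path):
    # Cache entries are directories of the form
    # <cache_dir>/<spider>/<fingerprint[:2]>/<fingerprint>/ containing a
    # "pickled_meta" file with keys such as "response_url", "timestamp",
    # "status", "cache_expires" and "parents", as used by cleanup_cache()
    # and remove_cache_entry() above.
    with open(os.path.join(cache_entry_path, "pickled_meta"), "rb") as f:
        return pickle.load(f)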
13 changes: 11 additions & 2 deletions feeds/extensions.py
@@ -1,3 +1,4 @@
import logging
import os
import pickle

@@ -8,6 +9,9 @@
from feeds.cache import IGNORE_HTTP_CODES, remove_cache_entry


logger = logging.getLogger(__name__)


class FeedsCacheStorage(FilesystemCacheStorage):
def __init__(self, settings):
super().__init__(settings)
@@ -19,9 +23,14 @@ def retrieve_response(self, spider, request):
"""Return response if present in cache, or None otherwise."""
metadata = self._read_meta(spider, request)
if metadata is not None and metadata["status"] in IGNORE_HTTP_CODES:
return # ignore cache entry for error responses
# ignore cache entry for error responses
logger.debug("Response for {} not cached".format(request))
return
# Retrieve response from cache.
return super().retrieve_response(spider, request)
try:
return super().retrieve_response(spider, request)
finally:
logger.debug("Retrieved response for {} from cache".format(request))

def store_response(self, spider, request, response):
"""Store the given response in the cache."""
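
FeedsCacheStorage extends Scrapy's FilesystemCacheStorage so that cached error responses (statuses in IGNORE_HTTP_CODES) are never served, and the added logging makes cache decisions visible at debug level. A minimal sketch of wiring such a storage class into Scrapy settings; how feeds actually configures this is an assumption here:

# Hypothetical Scrapy settings snippet; the exact values feeds uses internally
# are assumptions.
HTTPCACHE_ENABLED = True
HTTPCACHE_STORAGE = "feeds.extensions.FeedsCacheStorage"
HTTPCACHE_DIR = "cache"  # base directory that cleanup_cache() walks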
6 changes: 5 additions & 1 deletion feeds/loaders.py
@@ -134,7 +134,11 @@ def cleanup_html(tree, loader_context):
for elem in selector(tree):
for attrib in elem.attrib.keys():
if attrib in attribs:
elem.attrib[attribs[attrib]] = elem.attrib.pop(attrib)
old_attrib_value = elem.attrib.pop(attrib)
if attribs[attrib] is not None:
# If attribs[attrib] is None, attrib is removed instead of
# renamed.
elem.attrib[attribs[attrib]] = old_attrib_value

# Change tag names.
for elem_sel, elem_tag in loader_context.get("change_tags", {}).items():
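
The change above lets a change_attribs mapping drop an attribute by mapping it to None, while a string value still renames it. A small self-contained sketch of that behaviour, using the inner per-tag mapping directly with the same attributes the dietiwag spider below passes:

import lxml.html

# A single element standing in for what a change_attribs selector would match.
elem = lxml.html.fragment_fromstring('<font size="2" face="Arial" color="red">text</font>')
attribs = {"size": None, "face": None, "color": None}  # as in dietiwag_org.py below

for attrib in list(elem.attrib.keys()):
    if attrib in attribs:
        old_attrib_value = elem.attrib.pop(attrib)
        if attribs[attrib] is not None:
            # A string value renames the attribute; None leaves it removed.
            elem.attrib[attribs[attrib]] = old_attrib_value

print(lxml.html.tostring(elem))  # b'<font>text</font>'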
28 changes: 20 additions & 8 deletions feeds/spiders/dietiwag_org.py
@@ -33,30 +33,42 @@ def _parse_article(self, response):
remove_elems = [
".noprint",
"form",
".lineall > font[size='2'] > b:first-child",
"font[size='2'] > br:first-child",
"font[size='2'] > br:first-child",
"font[size='2'] > br:last-child",
"font[size='2'] > br:last-child",
"font[size='2'] > br:last-child",
"font[size='3'] > b",
"font[size='2'] > b:first-child",
'a[href="mailto:m.wilhelm@dietiwag.org"]',
"br:first-child",
"br:first-child",
"br:first-child",
"br:first-child",
"br:first-child",
"br:first-child",
"br:last-child",
"br:last-child",
"br:last-child",
"br:last-child",
"br:last-child",
"br:last-child",
]
replace_regex = {
r"\[\d{2}\.\d{2}\.\d{4}\]": "",
# A0 is a non-breaking space in latin1.
"\xA0": "",
r"<br>\s*<br>\s*\d{1,2}\.\d{1,2}\.\d{4}\s*<br>": "",
}
change_attribs = {"font": {"size": None, "face": None, "color": None}}
change_tags = {"font": "div", "center": "div"}
il = FeedEntryItemLoader(
response=response,
base_url=response.url,
remove_elems=remove_elems,
replace_regex=replace_regex,
change_attribs=change_attribs,
change_tags=change_tags,
parent=response.meta["il"],
)
il.add_css("author_name", ".sidebar .authors__name::text")
if response.css(".printwidth2"):
il.add_css("content_html", ".printwidth2 > font[size='2']")
il.add_css("content_html", ".printwidth2 > font[size='3'] > font[size='2']")
il.add_css("content_html", ".printwidth2")
else:
# Tagebuch
il.add_css("content_html", ".lineall")
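
The repeated "br:first-child" and "br:last-child" entries presumably work because remove_elems applies its selectors one after another, so each pass strips one more leading or trailing <br> — an assumption suggested by the repetition rather than confirmed by this diff. A minimal sketch of that effect:

import lxml.html
from lxml.cssselect import CSSSelector

# Each pass removes the current first <br>, so the next one moves up.
tree = lxml.html.fragment_fromstring("<div><br><br><br><p>Article text</p></div>")
selector = CSSSelector("br:first-child")

for _ in range(3):  # one pass per listed "br:first-child" selector
    for elem in selector(tree):
        elem.drop_tree()

print(lxml.html.tostring(tree))  # b'<div><p>Article text</p></div>'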
12 changes: 7 additions & 5 deletions feeds/spiders/falter_at.py
@@ -32,11 +32,13 @@ def start_requests(self):
if abonr and password:
yield scrapy.FormRequest(
url="https://www.{}/falter/e-paper/login".format(self.name),
formdata={
"login[abonr]": abonr,
"login[password]": password,
"redirect_url": "/archiv/",
},
formdata=OrderedDict(
[
("login[abonr]", abonr),
("login[password]", password),
("redirect_url", "/archiv/"),
]
),
meta={
"dont_redirect": True,
"cache_expires": timedelta(hours=3),
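
Switching formdata from a plain dict to an OrderedDict here, and in the spiders below, presumably keeps the POST body byte-for-byte stable across runs, and with it Scrapy's request fingerprint and cache key (dict ordering was not guaranteed before Python 3.7). A small sketch of the effect; the field names are illustrative only:

from collections import OrderedDict

from scrapy.http import FormRequest
from scrapy.utils.request import request_fingerprint

# Two logically identical logins whose fields are serialized in a different
# order produce different bodies and hence different fingerprints; an
# OrderedDict pins the order down.
a = FormRequest("https://example.com/login",
                formdata=OrderedDict([("user", "u"), ("password", "p")]))
b = FormRequest("https://example.com/login",
                formdata=OrderedDict([("password", "p"), ("user", "u")]))

print(a.body)  # b'user=u&password=p'
print(b.body)  # b'password=p&user=u'
print(request_fingerprint(a) == request_fingerprint(b))  # False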
4 changes: 3 additions & 1 deletion feeds/spiders/konsument_at.py
@@ -1,3 +1,5 @@
from collections import OrderedDict

import scrapy

from feeds.loaders import FeedEntryItemLoader
@@ -20,7 +22,7 @@ def parse(self, response):
yield scrapy.FormRequest.from_response(
response,
formcss="#login form",
formdata={"user": user, "pwd": pwd},
formdata=OrderedDict([("user", user), ("pwd", pwd)]),
callback=self._after_login,
meta={"dont_cache": True},
)
15 changes: 9 additions & 6 deletions feeds/spiders/lwn_net.py
@@ -1,4 +1,5 @@
import re
from collections import OrderedDict
from datetime import timedelta

import scrapy
@@ -84,12 +85,14 @@ def start_requests(self):
if username and password:
yield scrapy.FormRequest(
url="https://{}/login".format(self.name),
formdata={
"Username": username,
"Password": password,
"target": "/MyAccount/",
"submit": "Log+in",
},
formdata=OrderedDict(
[
("Username", username),
("Password", password),
("target", "/MyAccount/"),
("submit", "Log+in"),
]
),
callback=self._after_login,
# Session cookie is valid for a month. 14 days is a good compromise.
meta={"cache_expires": timedelta(days=14)},
16 changes: 10 additions & 6 deletions feeds/spiders/nachrichten_at.py
@@ -1,3 +1,5 @@
from collections import OrderedDict

import scrapy

from feeds.loaders import FeedEntryItemLoader
@@ -28,12 +30,14 @@ def start_requests(self):
if username and password:
yield scrapy.FormRequest(
"https://www.{}/login/".format(self.name),
formdata={
"user[control][login]": "true",
"permanent": "checked",
"username": username,
"password": password,
},
formdata=OrderedDict(
[
("user[control][login]", "true"),
("permanent", "checked"),
("username", username),
("password", password),
]
),
callback=self._after_login,
)
else:
4 changes: 3 additions & 1 deletion feeds/spiders/uebermedien_de.py
@@ -40,7 +40,9 @@ def start_requests(self):
def _steady_login(self, response):
yield FormRequest.from_response(
response,
formdata={"user[email]": self._username, "user[password]": self._password},
formdata=OrderedDict(
[("user[email]", self._username), ("user[password]", self._password)]
),
callback=self._request_steady_token,
dont_filter=True,
meta={"handle_httpstatus_list": [301], "cache_expires": timedelta(days=1)},
