Merge pull request #171 from Lukas0907/fixes
Improve dietiwag, fix caching problems
Lukas0907 committed Sep 18, 2018
2 parents 5813d43 + 17ade0e commit 9144df0
Showing 9 changed files with 83 additions and 47 deletions.
32 changes: 15 additions & 17 deletions feeds/cache.py
@@ -33,7 +33,6 @@ def cleanup_cache(cache_dir, expires):
for cache_entry_path, _dirs, files in os.walk(cache_dir, topdown=False):
if "pickled_meta" in files:
meta = read_meta(cache_entry_path)
logger.debug("Checking cache entry for URL {}".format(meta["response_url"]))
try:
entry_expires = timedelta(seconds=meta["cache_expires"])
except KeyError:
@@ -43,9 +42,6 @@ def cleanup_cache(cache_dir, expires):
datetime.fromtimestamp(meta["timestamp"], tz=timezone.utc)
+ entry_expires
)
logger.debug(
"Entry expires after {} at {}".format(entry_expires, threshold)
)
if now > threshold:
remove_cache_entry(cache_entry_path)
elif meta["status"] in IGNORE_HTTP_CODES:
@@ -62,17 +58,19 @@ def cleanup_cache(cache_dir, expires):


def remove_cache_entry(cache_entry_path, remove_parents=False):
if os.path.exists(cache_entry_path):
try:
meta = read_meta(cache_entry_path)
if remove_parents:
logger.debug(
"Removing parent cache entries for URL {}".format(meta["response_url"])
)
spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
for fingerprint in meta["parents"]:
path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
remove_cache_entry(path, read_meta(path)["response_url"])
logger.debug("Removing cache entry for URL {}".format(meta["response_url"]))
shutil.rmtree(cache_entry_path)
else:
logger.debug("Cannot remove cache entry {}".format(cache_entry_path))
except FileNotFoundError:
return

if remove_parents:
logger.debug(
"Removing parent cache entries for URL {}".format(meta["response_url"])
)
spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
for fingerprint in meta["parents"]:
path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
remove_cache_entry(path, remove_parents=False)

logger.debug("Removing cache entry for URL {}".format(meta["response_url"]))
shutil.rmtree(cache_entry_path, ignore_errors=True)
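
The new remove_cache_entry() tolerates entries that have already disappeared (FileNotFoundError) and removes parent entries non-recursively via remove_parents=False, where the old call passed a response URL as that positional flag. For orientation, a minimal sketch of a reader equivalent to the read_meta() used above, based on the on-disk layout the path handling in this file assumes (one directory per request fingerprint, containing a pickled_meta file); the real implementation in feeds/cache.py may differ:

import os
import pickle


def read_meta(cache_entry_path):
    # Cache entries are directories of the form
    # <cache_dir>/<spider>/<fingerprint[:2]>/<fingerprint>/ containing a
    # "pickled_meta" file with keys such as "response_url", "timestamp",
    # "status", "cache_expires" and "parents", as used by cleanup_cache()
    # and remove_cache_entry() above.
    with open(os.path.join(cache_entry_path, "pickled_meta"), "rb") as f:
        return pickle.load(f)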
13 changes: 11 additions & 2 deletions feeds/extensions.py
@@ -1,3 +1,4 @@
import logging
import os
import pickle

@@ -8,6 +9,9 @@
from feeds.cache import IGNORE_HTTP_CODES, remove_cache_entry


logger = logging.getLogger(__name__)


class FeedsCacheStorage(FilesystemCacheStorage):
def __init__(self, settings):
super().__init__(settings)
@@ -19,9 +23,14 @@ def retrieve_response(self, spider, request):
"""Return response if present in cache, or None otherwise."""
metadata = self._read_meta(spider, request)
if metadata is not None and metadata["status"] in IGNORE_HTTP_CODES:
return # ignore cache entry for error responses
# ignore cache entry for error responses
logger.debug("Response for {} not cached".format(request))
return
# Retrieve response from cache.
return super().retrieve_response(spider, request)
try:
return super().retrieve_response(spider, request)
finally:
logger.debug("Retrieved response for {} from cache".format(request))

def store_response(self, spider, request, response):
"""Store the given response in the cache."""
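
FeedsCacheStorage extends Scrapy's FilesystemCacheStorage so that cached error responses (statuses in IGNORE_HTTP_CODES) are never served, and the added logging makes cache decisions visible at debug level. A minimal sketch of wiring such a storage class into Scrapy settings; how feeds actually configures this is an assumption here:

# Hypothetical Scrapy settings snippet; the exact values feeds uses internally
# are assumptions.
HTTPCACHE_ENABLED = True
HTTPCACHE_STORAGE = "feeds.extensions.FeedsCacheStorage"
HTTPCACHE_DIR = "cache"  # base directory that cleanup_cache() walks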
6 changes: 5 additions & 1 deletion feeds/loaders.py
@@ -134,7 +134,11 @@ def cleanup_html(tree, loader_context):
for elem in selector(tree):
for attrib in elem.attrib.keys():
if attrib in attribs:
elem.attrib[attribs[attrib]] = elem.attrib.pop(attrib)
old_attrib_value = elem.attrib.pop(attrib)
if attribs[attrib] is not None:
# If attribs[attrib] is None, attrib is removed instead of
# renamed.
elem.attrib[attribs[attrib]] = old_attrib_value

# Change tag names.
for elem_sel, elem_tag in loader_context.get("change_tags", {}).items():
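
The change above lets a change_attribs mapping drop an attribute by mapping it to None, while a string value still renames it. A small self-contained sketch of that behaviour, using the inner per-tag mapping directly with the same attributes the dietiwag spider below passes:

import lxml.html

# A single element standing in for what a change_attribs selector would match.
elem = lxml.html.fragment_fromstring('<font size="2" face="Arial" color="red">text</font>')
attribs = {"size": None, "face": None, "color": None}  # as in dietiwag_org.py below

for attrib in list(elem.attrib.keys()):
    if attrib in attribs:
        old_attrib_value = elem.attrib.pop(attrib)
        if attribs[attrib] is not None:
            # A string value renames the attribute; None leaves it removed.
            elem.attrib[attribs[attrib]] = old_attrib_value

print(lxml.html.tostring(elem))  # b'<font>text</font>'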
28 changes: 20 additions & 8 deletions feeds/spiders/dietiwag_org.py
@@ -33,30 +33,42 @@ def _parse_article(self, response):
remove_elems = [
".noprint",
"form",
".lineall > font[size='2'] > b:first-child",
"font[size='2'] > br:first-child",
"font[size='2'] > br:first-child",
"font[size='2'] > br:last-child",
"font[size='2'] > br:last-child",
"font[size='2'] > br:last-child",
"font[size='3'] > b",
"font[size='2'] > b:first-child",
'a[href="mailto:m.wilhelm@dietiwag.org"]',
"br:first-child",
"br:first-child",
"br:first-child",
"br:first-child",
"br:first-child",
"br:first-child",
"br:last-child",
"br:last-child",
"br:last-child",
"br:last-child",
"br:last-child",
"br:last-child",
]
replace_regex = {
r"\[\d{2}\.\d{2}\.\d{4}\]": "",
# A0 is a non-breaking space in latin1.
"\xA0": "",
r"<br>\s*<br>\s*\d{1,2}\.\d{1,2}\.\d{4}\s*<br>": "",
}
change_attribs = {"font": {"size": None, "face": None, "color": None}}
change_tags = {"font": "div", "center": "div"}
il = FeedEntryItemLoader(
response=response,
base_url=response.url,
remove_elems=remove_elems,
replace_regex=replace_regex,
change_attribs=change_attribs,
change_tags=change_tags,
parent=response.meta["il"],
)
il.add_css("author_name", ".sidebar .authors__name::text")
if response.css(".printwidth2"):
il.add_css("content_html", ".printwidth2 > font[size='2']")
il.add_css("content_html", ".printwidth2 > font[size='3'] > font[size='2']")
il.add_css("content_html", ".printwidth2")
else:
# Tagebuch
il.add_css("content_html", ".lineall")
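
The repeated "br:first-child" and "br:last-child" entries presumably work because remove_elems applies its selectors one after another, so each pass strips one more leading or trailing <br> — an assumption suggested by the repetition rather than confirmed by this diff. A minimal sketch of that effect:

import lxml.html
from lxml.cssselect import CSSSelector

# Each pass removes the current first <br>, so the next one moves up.
tree = lxml.html.fragment_fromstring("<div><br><br><br><p>Article text</p></div>")
selector = CSSSelector("br:first-child")

for _ in range(3):  # one pass per listed "br:first-child" selector
    for elem in selector(tree):
        elem.drop_tree()

print(lxml.html.tostring(tree))  # b'<div><p>Article text</p></div>'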
12 changes: 7 additions & 5 deletions feeds/spiders/falter_at.py
@@ -32,11 +32,13 @@ def start_requests(self):
if abonr and password:
yield scrapy.FormRequest(
url="https://www.{}/falter/e-paper/login".format(self.name),
formdata={
"login[abonr]": abonr,
"login[password]": password,
"redirect_url": "/archiv/",
},
formdata=OrderedDict(
[
("login[abonr]", abonr),
("login[password]", password),
("redirect_url", "/archiv/"),
]
),
meta={
"dont_redirect": True,
"cache_expires": timedelta(hours=3),
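
Switching formdata from a plain dict to an OrderedDict here, and in the spiders below, presumably keeps the POST body byte-for-byte stable across runs, and with it Scrapy's request fingerprint and cache key (dict ordering was not guaranteed before Python 3.7). A small sketch of the effect; the field names are illustrative only:

from collections import OrderedDict

from scrapy.http import FormRequest
from scrapy.utils.request import request_fingerprint

# Two logically identical logins whose fields are serialized in a different
# order produce different bodies and hence different fingerprints; an
# OrderedDict pins the order down.
a = FormRequest("https://example.com/login",
                formdata=OrderedDict([("user", "u"), ("password", "p")]))
b = FormRequest("https://example.com/login",
                formdata=OrderedDict([("password", "p"), ("user", "u")]))

print(a.body)  # b'user=u&password=p'
print(b.body)  # b'password=p&user=u'
print(request_fingerprint(a) == request_fingerprint(b))  # False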
4 changes: 3 additions & 1 deletion feeds/spiders/konsument_at.py
@@ -1,3 +1,5 @@
from collections import OrderedDict

import scrapy

from feeds.loaders import FeedEntryItemLoader
@@ -20,7 +22,7 @@ def parse(self, response):
yield scrapy.FormRequest.from_response(
response,
formcss="#login form",
formdata={"user": user, "pwd": pwd},
formdata=OrderedDict([("user", user), ("pwd", pwd)]),
callback=self._after_login,
meta={"dont_cache": True},
)
15 changes: 9 additions & 6 deletions feeds/spiders/lwn_net.py
@@ -1,4 +1,5 @@
import re
from collections import OrderedDict
from datetime import timedelta

import scrapy
@@ -84,12 +85,14 @@ def start_requests(self):
if username and password:
yield scrapy.FormRequest(
url="https://{}/login".format(self.name),
formdata={
"Username": username,
"Password": password,
"target": "/MyAccount/",
"submit": "Log+in",
},
formdata=OrderedDict(
[
("Username", username),
("Password", password),
("target", "/MyAccount/"),
("submit", "Log+in"),
]
),
callback=self._after_login,
# Session cookie is valid for a month. 14 days is a good compromise.
meta={"cache_expires": timedelta(days=14)},
16 changes: 10 additions & 6 deletions feeds/spiders/nachrichten_at.py
@@ -1,3 +1,5 @@
from collections import OrderedDict

import scrapy

from feeds.loaders import FeedEntryItemLoader
@@ -28,12 +30,14 @@ def start_requests(self):
if username and password:
yield scrapy.FormRequest(
"https://www.{}/login/".format(self.name),
formdata={
"user[control][login]": "true",
"permanent": "checked",
"username": username,
"password": password,
},
formdata=OrderedDict(
[
("user[control][login]", "true"),
("permanent", "checked"),
("username", username),
("password", password),
]
),
callback=self._after_login,
)
else:
4 changes: 3 additions & 1 deletion feeds/spiders/uebermedien_de.py
@@ -40,7 +40,9 @@ def start_requests(self):
def _steady_login(self, response):
yield FormRequest.from_response(
response,
formdata={"user[email]": self._username, "user[password]": self._password},
formdata=OrderedDict(
[("user[email]", self._username), ("user[password]", self._password)]
),
callback=self._request_steady_token,
dont_filter=True,
meta={"handle_httpstatus_list": [301], "cache_expires": timedelta(days=1)},
