
Merge pull request #190 from Lukas0907/next

Fix cache problems; spider enhancements

Lukas0907 committed Jan 30, 2019
2 parents ec2f493 + 50823f4 commit c9069ce
Showing 9 changed files with 20 additions and 10 deletions.
6 changes: 1 addition & 5 deletions feeds/cache.py
@@ -183,16 +183,12 @@ def remove_cache_entry(self, cache_entry_path, remove_parents=False):
         if meta is None:
             return
 
-        if remove_parents:
-            logger.debug(
-                "Removing parent cache entries for URL {}".format(meta["response_url"])
-            )
+        if remove_parents and "parents" in meta:
             spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
             for fingerprint in meta["parents"]:
                 path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
                 self.remove_cache_entry(path, remove_parents=False)
 
-        logger.debug("Removing cache entry for URL {}".format(meta["response_url"]))
         shutil.rmtree(cache_entry_path, ignore_errors=True)


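The added "parents" in meta guard matters because cache entries written before parent tracking existed have no "parents" key, so meta["parents"] would raise a KeyError during cleanup. A minimal standalone sketch of the guard; the meta dicts below are made-up stand-ins for the metadata Feeds stores next to each cache entry:

    # Hypothetical metadata dicts; an entry written by an older version
    # of Feeds may lack the "parents" key entirely.
    old_meta = {"response_url": "https://example.com/feed"}
    new_meta = {"response_url": "https://example.com/feed", "parents": ["ab12cd"]}

    for meta in (old_meta, new_meta):
        # Without the membership test, meta["parents"] raises KeyError
        # for pre-existing cache entries.
        if "parents" in meta:
            for fingerprint in meta["parents"]:
                print("would remove parent entry", fingerprint)
        else:
            print("no parent entries recorded for", meta["response_url"])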
3 changes: 3 additions & 0 deletions feeds/default_settings.py
@@ -46,6 +46,9 @@
 HTTPCACHE_EXPIRATION_SECS = FEEDS_CONFIG_CACHE_EXPIRES * 24 * 60 * 60
 HTTPCACHE_IGNORE_HTTP_CODES = list(range(400, 600))
 
+# Do not enable cookies by default to make better use of the cache.
+COOKIES_ENABLED = False
+
 RETRY_ENABLED = True
 # equals 5 requests in total
 RETRY_TIMES = 4
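With cookies off project-wide, requests stay stateless and cached responses can be replayed without a session; spiders that genuinely need a session opt back in per spider, which is what the remaining files in this commit do. A minimal sketch of the override mechanism, using a hypothetical spider that is not part of this commit:

    import scrapy

    class ExampleSpider(scrapy.Spider):
        # custom_settings takes precedence over the project-wide
        # COOKIES_ENABLED = False from default_settings.py, so only
        # this spider carries cookies.
        name = "example"
        start_urls = ["https://example.com/"]
        custom_settings = {"COOKIES_ENABLED": True}

        def parse(self, response):
            self.logger.info("Got %s with cookies enabled", response.url)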
5 changes: 4 additions & 1 deletion feeds/spiders/biblioweb_at.py
@@ -6,7 +6,10 @@

 class BibliowebAtSpider(FeedsSpider):
     name = "biblioweb.at"
-    custom_settings = {"DUPEFILTER_CLASS": "scrapy.dupefilters.RFPDupeFilter"}
+    custom_settings = {
+        "DUPEFILTER_CLASS": "scrapy.dupefilters.RFPDupeFilter",
+        "COOKIES_ENABLED": True,
+    }
 
     _days = 60

2 changes: 1 addition & 1 deletion feeds/spiders/falter_at.py
@@ -14,7 +14,7 @@
 class FalterAtSpider(FeedsSpider):
     name = "falter.at"
     # Don't overwhelm the poor Wordpress with too many requests at once.
-    custom_settings = {"DOWNLOAD_DELAY": 1.0}
+    custom_settings = {"DOWNLOAD_DELAY": 1.0, "COOKIES_ENABLED": True}
 
     def start_requests(self):
         pages = self.settings.get("FEEDS_SPIDER_FALTER_AT_PAGES")
1 change: 1 addition & 0 deletions feeds/spiders/konsument_at.py
@@ -10,6 +10,7 @@
 class KonsumentAtSpider(FeedsSpider):
     name = "konsument.at"
     start_urls = ["https://www.konsument.at/page/das-aktuelle-heft"]
+    custom_settings = {"COOKIES_ENABLED": True}
 
     feed_title = "KONSUMENT.AT"
     feed_subtitle = "Objektiv, unbestechlich, keine Werbung"
2 changes: 1 addition & 1 deletion feeds/spiders/lwn_net.py
@@ -70,7 +70,7 @@ class LwnNetSpider(FeedsXMLFeedSpider):
     # introduced rss namespace prefix.
     iterator = "xml"
     # lwn.net doesn't like it (i.e. blocks us) if we impose too much load.
-    custom_settings = {"DOWNLOAD_DELAY": 1.0}
+    custom_settings = {"DOWNLOAD_DELAY": 1.0, "COOKIES_ENABLED": True}
 
     _subscribed = False

1 change: 1 addition & 0 deletions feeds/spiders/nachrichten_at.py
@@ -11,6 +11,7 @@

 class NachrichtenAtSpider(FeedsXMLFeedSpider):
     name = "nachrichten.at"
+    custom_settings = {"COOKIES_ENABLED": True}
 
     def start_requests(self):
         self._ressorts = self.settings.get("FEEDS_SPIDER_NACHRICHTEN_AT_RESSORTS")
9 changes: 7 additions & 2 deletions feeds/spiders/tuwien_ac_at.py
@@ -31,9 +31,14 @@ def parse(self, response):

         response = yield scrapy.Request(link, method="HEAD")
         mb_url = response.url
-        mb_id = re.search(
+        match = re.search(
             r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)", mb_url
-        ).group(1)
+        )
+        if not match:
+            self.logger.error("No Mitteilungsblätter found!")
+            return
+        else:
+            mb_id = match.group(1)
 
         url = "https://tiss.{}/api/mbl/v22/id/{}".format(self.name, mb_id)
         response = yield scrapy.Request(url)
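The old code chained .group(1) directly onto re.search(), which returns None when the pattern does not match, so a miss died with an unhelpful AttributeError instead of a log message. A standalone sketch of both behaviours; the non-matching URL is a made-up example:

    import re

    pattern = r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)"
    mb_url = "https://tiss.tuwien.ac.at/error"  # hypothetical non-matching URL

    # Old behaviour: re.search() returns None on a miss, so .group(1)
    # raises AttributeError ('NoneType' object has no attribute 'group').
    try:
        mb_id = re.search(pattern, mb_url).group(1)
    except AttributeError:
        print("old code would crash here")

    # New behaviour: test the match object before using it.
    match = re.search(pattern, mb_url)
    if not match:
        print("No Mitteilungsblätter found!")
    else:
        print("mb_id =", match.group(1))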
1 change: 1 addition & 0 deletions feeds/spiders/uebermedien_de.py
@@ -15,6 +15,7 @@ class UebermedienDeSpider(FeedsXMLFeedSpider):
name = "uebermedien.de"
start_urls = ["https://uebermedien.de/feed/"]
namespaces = [("dc", "http://purl.org/dc/elements/1.1/")]
custom_settings = {"COOKIES_ENABLED": True}

feed_title = "uebermedien.de"
feed_subtitle = "Medien besser kritisieren."
