Merge pull request #132 from Lukas0907/fixes
Fixes related to caching, LWN.net and various improvements
Lukas0907 committed Jul 23, 2018
2 parents 89bfd1f + b1662c7 commit aa368e9
Showing 34 changed files with 376 additions and 296 deletions.
2 changes: 0 additions & 2 deletions docs/generate_spider_documentation_template.py
@@ -36,5 +36,3 @@ def main(spider_name):

if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 smartindent autoindent
12 changes: 9 additions & 3 deletions docs/spiders/falter.at.rst
@@ -2,7 +2,7 @@

falter.at
---------
Get newest articles and restaurant reviews ("wwei") from Falter_.

Configuration
~~~~~~~~~~~~~
@@ -14,13 +14,19 @@ Add ``falter.at`` to the list of spiders:
spiders =
    falter.at

Falter_ has a paywall for certain articles. If you want to crawl paid articles, please
provide ``abonr`` (subscription number) and ``password``.

``pages`` accepts ``magazine`` for the Falter newspaper and ``wwei`` for the restaurant
reviews. By default both are scraped.

.. code-block:: ini

   [falter.at]
   abonr =
   password =
   pages =
     magazine
     wwei

.. _Falter: https://www.falter.at
10 changes: 7 additions & 3 deletions docs/spiders/lwn.net.rst
@@ -2,7 +2,11 @@

lwn.net
-------
Newest articles from LWN_ with special treatment of LWN_ Weekly Editions. Please note
that LWN_ requires the cache to be enabled to minimize useless requests. If you provide
a username and password, the session (cookie) is also cached until the cache entry
expires. The session cookie is valid for a month, so to avoid disruptions, set the
cache expiry time to less than that.
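
For illustration, a ``feeds.cfg`` along these lines should satisfy this (a sketch only;
``cache_enabled`` and ``cache_expires`` are the keys used in ``feeds.cfg.dist`` and
``feeds/cli.py`` elsewhere in this commit, and 14 days is the default expiry there):

.. code-block:: ini

   [feeds]
   spiders =
     lwn.net
   ## Caching must be enabled for lwn.net.
   cache_enabled = 1
   ## Expire entries (and the cached session cookie) well before the one-month
   ## session cookie lifetime.
   cache_expires = 14

   [lwn.net]
   username =
   password =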

Configuration
~~~~~~~~~~~~~
@@ -14,8 +18,8 @@ Add ``lwn.net`` to the list of spiders:
spiders =
    lwn.net

LWN_ has paywalled articles. If you want to crawl them, please provide ``username`` and
``password``.

.. code-block:: ini
6 changes: 6 additions & 0 deletions feeds.cfg.dist
@@ -17,6 +17,12 @@ useragent = feeds (+https://github.com/nblock/feeds)
## See also: https://validator.w3.org/feed/docs/warning/MissingSelf.html
# output_url = https://example.com/feeds

## Truncate content to 10 words instead of including the full text.
## This can be useful if generated feeds should be made publicly available.
# truncate_words = 10
## Remove images from output.
# remove_images = 1

## Enable caching of responses
# cache_enabled = 0
## Path to the cache.
33 changes: 15 additions & 18 deletions feeds/cache.py
@@ -7,7 +7,7 @@
logger = logging.getLogger(__name__)


IGNORE_HTTP_CODES = [404, 500, 502, 503, 504]
IGNORE_HTTP_CODES = [403, 404, 500, 502, 503, 504]


def read_meta(root):
@@ -23,22 +23,11 @@ def cleanup_cache(cache_dir, max_age):
    for cache_entry_path, _dirs, files in os.walk(cache_dir, topdown=False):
        if "pickled_meta" in files:
            meta = read_meta(cache_entry_path)

            timestamp = datetime.fromtimestamp(meta["timestamp"])
            if timestamp < max_age:
                remove_cache_entry(cache_entry_path, meta["response_url"])
                remove_cache_entry(cache_entry_path)
            elif meta["status"] in IGNORE_HTTP_CODES:
                remove_cache_entry(cache_entry_path, meta["response_url"])
                logger.debug(
                    "Removing parent cache entries for URL {}".format(
                        meta["response_url"]
                    )
                )
                spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
                # Remove parents as well.
                for fingerprint in meta["parents"]:
                    path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
                    remove_cache_entry(path, read_meta(path)["response_url"])
                remove_cache_entry(cache_entry_path, remove_parents=True)
        elif not os.path.samefile(cache_entry_path, cache_dir):
            # Try to delete parent directory of cache entries.
            try:
@@ -50,10 +39,18 @@ def cleanup_cache(cache_dir, max_age):
logger.debug("Finished cleaning cache entries.")


def remove_cache_entry(cache_entry_path, url):
def remove_cache_entry(cache_entry_path, remove_parents=False):
    if os.path.exists(cache_entry_path):
        logger.debug("Removing cache entry for URL {}".format(url))
        meta = read_meta(cache_entry_path)
        if remove_parents:
            logger.debug(
                "Removing parent cache entries for URL {}".format(meta["response_url"])
            )
            spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
            for fingerprint in meta["parents"]:
                path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
                remove_cache_entry(path)
logger.debug("Removing cache entry for URL {}".format(meta["response_url"]))
shutil.rmtree(cache_entry_path)
else:
logger.error("Cannot remove cache entry {} for URL {}".format(
cache_entry_path, url))
logger.debug("Cannot remove cache entry {}".format(cache_entry_path))
42 changes: 5 additions & 37 deletions feeds/cli.py
@@ -1,4 +1,3 @@
import configparser
import logging
import os
from datetime import datetime, timedelta
@@ -10,49 +9,18 @@
from twisted.python import failure

from feeds.cache import cleanup_cache
from feeds.settings import load_feeds_settings

logger = logging.getLogger(__name__)

FEEDS_CFGFILE_MAPPING = {
    "USER_AGENT": "useragent",
    "LOG_LEVEL": "loglevel",
    "HTTPCACHE_ENABLED": "cache_enabled",
    "HTTPCACHE_DIR": "cache_dir",
}


def run_cleanup_cache(settings):
    days = int(
        settings.get("FEEDS_CONFIG", {}).get("feeds", {}).get("cache_expires", 14)
    )
    days = settings.getint("FEEDS_CONFIG_CACHE_EXPIRES")
    cleanup_cache(
        data_path(settings["HTTPCACHE_DIR"]), datetime.now() - timedelta(days=days)
        data_path(settings.get("HTTPCACHE_DIR")), datetime.now() - timedelta(days=days)
    )


def get_feeds_settings(file_=None):
    if file_:
        logger.debug("Parsing configuration file {} ...".format(file_.name))
        # Parse configuration file and store result under FEEDS_CONFIG of
        # scrapy's settings API.
        parser = configparser.ConfigParser()
        parser.read_file(file_)
        config = {s: dict(parser.items(s)) for s in parser.sections()}
    else:
        config = {}

    settings = get_project_settings()
    settings.set("FEEDS_CONFIG", config)

    # Mapping of feeds config section to setting names.
    for settings_key, config_key in FEEDS_CFGFILE_MAPPING.items():
        config_value = config.get("feeds", {}).get(config_key)
        if config_value:
            settings.set(settings_key, config_value)

    return settings


def spiders_to_crawl(process, argument_spiders):
    if argument_spiders:
        # Spider(s) given as command line argument(s).
@@ -61,7 +29,7 @@ def spiders_to_crawl(process, argument_spiders):

    try:
        # Spider(s) given in configuration file.
        spiders = process.settings.get("FEEDS_CONFIG")["feeds"]["spiders"]
        spiders = process.settings.get("FEEDS_CONFIG_SPIDERS")
        logger.debug("Using configuration file to decide what spiders to run.")
        return spiders.split()
    except KeyError:
@@ -90,7 +58,7 @@ def cli(ctx, loglevel, config, pdb):
failure.startDebugMode()
os.chdir(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))

settings = get_feeds_settings(config)
settings = load_feeds_settings(config)
settings.set("LOG_LEVEL", loglevel.upper())
ctx.obj["settings"] = settings

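
A note on the new feeds/settings.py module: it is among the 34 changed files but is not
shown in this excerpt. The following is a hypothetical sketch only, reconstructed from
the removed get_feeds_settings() above and from the FEEDS_CONFIG_SPIDERS and
FEEDS_CONFIG_CACHE_EXPIRES keys that cli.py now reads; the actual names, defaults and
structure in the commit may differ.

    # Hypothetical sketch of feeds/settings.py -- not part of this diff.
    import configparser
    import logging

    from scrapy.utils.project import get_project_settings

    logger = logging.getLogger(__name__)

    # Mapping of scrapy setting names to keys in the [feeds] section, as in the
    # FEEDS_CFGFILE_MAPPING removed from cli.py.
    FEEDS_CFGFILE_MAPPING = {
        "USER_AGENT": "useragent",
        "LOG_LEVEL": "loglevel",
        "HTTPCACHE_ENABLED": "cache_enabled",
        "HTTPCACHE_DIR": "cache_dir",
    }


    def load_feeds_settings(file_=None):
        """Load the scrapy project settings and merge in an optional feeds.cfg."""
        settings = get_project_settings()
        if not file_:
            return settings

        logger.debug("Parsing configuration file {} ...".format(file_.name))
        parser = configparser.ConfigParser()
        parser.read_file(file_)
        config = {s: dict(parser.items(s)) for s in parser.sections()}
        settings.set("FEEDS_CONFIG", config)

        feeds = config.get("feeds", {})
        # Flattened keys consumed by cli.py in this commit (assumed here).
        settings.set("FEEDS_CONFIG_SPIDERS", feeds.get("spiders", ""))
        settings.set("FEEDS_CONFIG_CACHE_EXPIRES", int(feeds.get("cache_expires", 14)))

        for settings_key, config_key in FEEDS_CFGFILE_MAPPING.items():
            config_value = feeds.get(config_key)
            if config_value:
                settings.set(settings_key, config_value)

        return settings

With a module along these lines, run_cleanup_cache() reads the expiry via
settings.getint("FEEDS_CONFIG_CACHE_EXPIRES") and spiders_to_crawl() reads
FEEDS_CONFIG_SPIDERS, as shown in the cli.py hunks above.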
54 changes: 54 additions & 0 deletions feeds/default_settings.py
@@ -0,0 +1,54 @@
import logging

# Feeds configuration populated by an optional feeds configuration file.
FEEDS_CONFIG = {}

# Low level settings intended for scrapy.
# Please use feeds.cfg to configure feeds.

BOT_NAME = "feeds"
SPIDER_MODULES = ["feeds.spiders"]
NEWSPIDER_MODULE = "feeds.spiders"

# Don't overwhelm sites with requests.
CONCURRENT_REQUESTS_PER_DOMAIN = 2
DOWNLOAD_DELAY = 0.25

# Disable telnet
TELNETCONSOLE_ENABLED = False

# Custom item pipeline
ITEM_PIPELINES = {
    "feeds.pipelines.AtomAutogenerateFieldsPipeline": 100,
    "feeds.pipelines.AtomCheckRequiredFieldsPipeline": 110,
    "feeds.pipelines.AtomExportPipeline": 400,
}

SPIDER_MIDDLEWARES = {
    "feeds.spidermiddlewares.FeedsHttpErrorMiddleware": 51,
    "feeds.spidermiddlewares.FeedsHttpCacheMiddleware": 1000,
}

DOWNLOADER_MIDDLEWARES = {
    "feeds.downloadermiddlewares.FeedsHttpCacheMiddleware": 900,
    "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": None,
}

HTTPCACHE_ENABLED = False
HTTPCACHE_STORAGE = "feeds.extensions.FeedsCacheStorage"
HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
HTTPCACHE_DIR = "cache"
# We cache everything and delete cache entries (and every parent request) during
# cleanup.
HTTPCACHE_IGNORE_HTTP_CODES = []

# Default user agent. Can be overridden in feeds.cfg.
USER_AGENT = "feeds (+https://github.com/nblock/feeds)"

# Set default level to info.
# Can be overridden with the --loglevel parameter.
LOG_LEVEL = logging.INFO

# Stats collection is disabled by default.
# Can be overridden with the --stats parameter.
STATS_CLASS = "scrapy.statscollectors.DummyStatsCollector"
13 changes: 13 additions & 0 deletions feeds/downloadermiddlewares.py
@@ -0,0 +1,13 @@
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy import signals


class FeedsHttpCacheMiddleware(HttpCacheMiddleware):
    @classmethod
    def from_crawler(cls, crawler):
        o = super().from_crawler(crawler)
        crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
        return o

    def item_dropped(self, item, response, exception, spider):
        self.storage.item_dropped(item, response, exception, spider)
3 changes: 0 additions & 3 deletions feeds/exporters.py
@@ -198,6 +198,3 @@ def export_item(self, item):
link_self = None
self._feeds[path] = self.AtomFeed(exporter=self, link_self=link_self)
self._feeds[path].add_item(item)


# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 smartindent autoindent
33 changes: 14 additions & 19 deletions feeds/extensions.py
@@ -1,26 +1,11 @@
import os

import pickle
from scrapy import signals
from scrapy.extensions.httpcache import FilesystemCacheStorage
from scrapy.utils.request import request_fingerprint
from scrapy.utils.python import to_bytes

from feeds.cache import IGNORE_HTTP_CODES


class SpiderSettings:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        return ext

    def spider_opened(self, spider):
        spider.spider_settings = self.spider_settings(spider)

    @classmethod
    def spider_settings(cls, spider):
        return spider.settings.get("FEEDS_CONFIG").get(spider.name, {})
from feeds.cache import IGNORE_HTTP_CODES, remove_cache_entry


class FeedsCacheStorage(FilesystemCacheStorage):
@@ -47,9 +32,10 @@ def store_response(self, spider, request, response):
        # Read the new metadata.
        metadata = self._read_meta(spider, request)
        # Add the parents' fingerprints to the metadata and merge the parents from the
        # old metadata.
        # old metadata. The last fingerprint is not included since it's the fingerprint
        # of this request.
        metadata["parents"] = list(
            set(request.meta["fingerprints"]).union(
            set(request.meta["fingerprints"][:-1]).union(
                old_metadata["parents"] if old_metadata else []
            )
        )
@@ -59,3 +45,12 @@ def store_response(self, spider, request, response):
            f.write(to_bytes(repr(metadata)))
        with self._open(os.path.join(rpath, "pickled_meta"), "wb") as f:
            pickle.dump(metadata, f, protocol=2)

    def _get_request_path(self, spider, request):
        key = request_fingerprint(request, include_headers=["Cookie"])
        return os.path.join(self.cachedir, spider.name, key[0:2], key)

    def item_dropped(self, item, response, exception, spider):
        remove_cache_entry(
            self._get_request_path(spider, response.request), remove_parents=True
        )
3 changes: 0 additions & 3 deletions feeds/items.py
@@ -59,6 +59,3 @@ class FeedEntryItem(BaseItem):
    enclosure_iri = scrapy.Field()
    # Optional
    enclosure_type = scrapy.Field()


# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 smartindent autoindent
