Merge pull request #155 from Lukas0907/fixes
Fixes
Lukas0907 committed Aug 20, 2018
2 parents 4bee0ad + fb63c5d commit e344239
Showing 13 changed files with 45 additions and 46 deletions.
26 changes: 14 additions & 12 deletions README.rst
@@ -29,10 +29,13 @@ Feeds comes with extensive documentation. It is available at
 Supported Websites
 ------------------
 
-Feeds is currently able to create Atom feeds for various sites. The complete
-list of `supported websites is available in the documentation
+Feeds is currently able to create full text Atom feeds for various sites. The
+complete list of `supported websites is available in the documentation
 <https://pyfeeds.readthedocs.io/en/latest/spiders.html>`_.
 
+Content behind paywalls
+```````````````````````
+
 Some sites (Falter_, Konsument_, LWN_, `Oberösterreichische Nachrichten`_,
 Übermedien_) offer articles only behind a paywall. If you have a paid
 subscription, you can configure your username and password in ``feeds.cfg`` and
@@ -47,15 +50,17 @@ Installation
 ------------
 
 Feeds is meant to be installed on your server and run periodically in a cron
-job.
+job or similar job scheduler.
 
 The easiest way to install Feeds is via ``pip`` in a virtual environment. Feeds
 does not provide any releases yet, so one might directly install the current
 master branch::
 
 $ git clone https://github.com/nblock/feeds.git
 $ cd feeds
-$ pip install .
+$ python3 -m venv venv
+$ source bin/activate
+$ pip install -e .
 
 After installation ``feeds`` is available in your virtual environment.
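
A minimal crontab sketch of the periodic setup mentioned above; the schedule,
the paths and the assumption that ``crawl`` without a spider name runs every
spider enabled in the config file are illustrative only, not part of this
commit::

    # Hypothetical crontab entry: crawl once per hour with the feeds binary
    # from the virtual environment created above.
    0 * * * * /home/user/feeds/venv/bin/feeds --config /home/user/feeds.cfg crawl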

@@ -88,13 +93,10 @@ Quickstart
 Caching
 -------
 
-Feeds can be configured to use a cache for HTTP responses which is highly
-recommended to save bandwidth. It can be enabled in the config file. See
-feeds.cfg.dist for an example on how to do that.
-
-Entries are cached for 14 days by default (this can be overwritten in the
-config file). Entries are purged from cache automatically after a crawl. It's
-also possible to explicitly invalidate the cache::
+Feeds caches HTTP responses by default to save bandwidth. Entries are cached
+for 90 days by default (this can be overwritten in the config file). Outdated
+entries are purged from cache automatically after a crawl. It's also possible
+to explicitly purge the cache from outdated entries::
 
 $ feeds --config feeds.cfg cleanup
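
A short ``feeds.cfg`` sketch of the cache settings described above, based on
the options shown in ``feeds.cfg.dist`` and ``docs/configure.rst`` in this
commit; the 30-day value is just an example of overriding the new 90-day
default::

    [feeds]
    # Keep the HTTP response cache enabled (the default).
    cache_enabled = 1
    # Where cached responses are stored.
    cache_dir = ~/.cache/feeds
    # Expire (remove) entries from cache after 30 days instead of 90.
    cache_expires = 30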

@@ -148,7 +150,7 @@ AGPL3, see `LICENSE`_ for details.
 .. _Konsument: https://pyfeeds.readthedocs.io/en/latest/spiders/konsument.at.html
 .. _LWN: https://pyfeeds.readthedocs.io/en/latest/spiders/lwn.net.html
 .. _Oberösterreichische Nachrichten: https://pyfeeds.readthedocs.io/en/latest/spiders/nachrichten.at.html
-.. _Übermedien: https://pyfeeds.readthedocs.io/en/latest/spiders/uebermedien.com.html
+.. _Übermedien: https://pyfeeds.readthedocs.io/en/latest/spiders/uebermedien.de.html
 
 .. |build-status| image:: https://travis-ci.org/nblock/feeds.svg?branch=master
    :alt: build status
4 changes: 2 additions & 2 deletions docs/configure.rst
@@ -72,12 +72,12 @@ The path where cache data is stored.
 cache_expires
 ~~~~~~~~~~~~~
-Expire (remove) entries from cache after 14 days.
+Expire (remove) entries from cache after 90 days.
 
 .. code-block:: ini
 [feeds]
-cache_expires = 14
+cache_expires = 90
 Spider specific settings
 ------------------------
8 changes: 5 additions & 3 deletions docs/get.rst
@@ -9,9 +9,11 @@ master branch:
 
 .. code-block:: bash
-$ git clone https://github.com/nblock/feeds.git
-$ cd feeds
-$ pip install .
+$ git clone https://github.com/nblock/feeds.git
+$ cd feeds
+$ python3 -m venv venv
+$ source bin/activate
+$ pip install -e .
 After installation ``feeds`` is available in your virtual environment.
 
2 changes: 1 addition & 1 deletion docs/quickstart.rst
@@ -12,7 +12,7 @@ Feeds has a few commands that are described on this page.
 
 .. code-block:: bash
-$ feeds crawl tvthek.orf.at
+$ feeds crawl indiehackers.com
 * A :ref:`configuration file <Configure Feeds>` is supported too. Simply copy
 the :ref:`example configuration` and adjust it. Enable the spiders you are
2 changes: 1 addition & 1 deletion docs/spiders.rst
@@ -2,7 +2,7 @@
 
 Supported Websites
 ==================
-Feeds is currently able to create Atom feeds for the following websites:
+Feeds is currently able to create full text Atom feeds for the following websites:
 
 .. toctree::
 :maxdepth: 1
@@ -1,18 +1,18 @@
-.. _spider_uebermedien.com:
+.. _spider_uebermedien.de:
 
-uebermedien.com
----------------
+uebermedien.de
+--------------
 Newest articles from Übermedien_.
 
 Configuration
 ~~~~~~~~~~~~~
-Add ``uebermedien.com`` to the list of spiders:
+Add ``uebermedien.de`` to the list of spiders:
 
 .. code-block:: ini
 # List of spiders to run by default, one per line.
 spiders =
-uebermedien.com
+uebermedien.de
 Übermedien_ has a paywall for certain articles. If you want to crawl paid
4 changes: 2 additions & 2 deletions feeds.cfg.dist
@@ -27,8 +27,8 @@ useragent = feeds (+https://github.com/nblock/feeds)
 # cache_enabled = 1
 ## Path to the cache.
 # cache_dir = ~/.cache/feeds
-## Expire (remove) entries from cache after 14 days
-# cache_expires = 14
+## Expire (remove) entries from cache after 90 days
+# cache_expires = 90
 
 #[generic]
 ## A list of URLs to RSS/Atom feeds.
2 changes: 1 addition & 1 deletion feeds/cli.py
@@ -117,7 +117,7 @@ def cleanup(ctx):
 """
 Cleanup old cache entries.
-By default, entries older than 14 days will be removed. This value can be
+By default, entries older than 90 days will be removed. This value can be
 overriden in the config file.
 """
 settings = ctx.obj["settings"]
2 changes: 1 addition & 1 deletion feeds/default_settings.py
@@ -6,7 +6,7 @@
 # Default settings for Feeds specific configurations.
 FEEDS_CONFIG_OUTPUT_PATH = "output"
 FEEDS_CONFIG_FILE = os.path.join(xdg_config_home, "feeds.cfg")
-FEEDS_CONFIG_CACHE_EXPIRES = 14
+FEEDS_CONFIG_CACHE_EXPIRES = 90
 
 # Low level settings intended for scrapy.
 # Please use feeds.cfg to configure feeds.
10 changes: 5 additions & 5 deletions feeds/spiders/derstandard_at.py
@@ -84,8 +84,9 @@ def _parse_article(self, response):
 change_tags = {"#media-list li": "div", "#media-list": "div"}
 replace_regex = {
 # data-zoom-src is only valid if it starts with //images.derstandard.at.
-r'<img[^>]+data-zoom-src="(//images.derstandard.at/[^"]+)"':
-r'<img src="\1"'
+r'<img[^>]+data-zoom-src="(//images.derstandard.at/[^"]+)"': (
+r'<img src="\1"'
+)
 }
 replace_elems = {
 ".embedded-posting": "<p><em>Hinweis: Das eingebettete Posting ist nur "
@@ -114,9 +115,8 @@ def _parse_article(self, response):
 ).format(self.name, blog_id)
 yield scrapy.Request(url, self._parse_blog_article, meta={"il": il})
 elif response.css("#feature-content"):
-cover_photo = (
-response.css("#feature-cover-photo::attr(style)").
-re_first('\((.*)\)')
+cover_photo = response.css("#feature-cover-photo::attr(style)").re_first(
+"\((.*)\)"
 )
 il.add_value("content_html", '<img src="{}">'.format(cover_photo))
 il.add_css("content_html", "#feature-cover-title h2")
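
Both hunks above only re-wrap this code in black-style formatting; the regular
expressions themselves are unchanged. A small standalone sketch of what the
``replace_regex`` substitution and the ``re_first`` extraction do (the sample
markup and URLs are made up for illustration)::

    import re

    # Same pattern/replacement as in replace_regex: rewrite lazy-loading <img>
    # tags so that src points at the data-zoom-src URL, but only when that URL
    # starts with //images.derstandard.at.
    pattern = r'<img[^>]+data-zoom-src="(//images.derstandard.at/[^"]+)"'
    replacement = r'<img src="\1"'
    html = '<img class="zoom" data-zoom-src="//images.derstandard.at/foo.jpg">'
    print(re.sub(pattern, replacement, html))
    # -> <img src="//images.derstandard.at/foo.jpg">

    # The cover photo URL is pulled out of an inline style value, presumably
    # something like background-image: url(...); re_first with a group behaves
    # like re.search(...).group(1) here.
    style = "background-image: url(https://images.derstandard.at/cover.jpg)"
    print(re.search(r"\((.*)\)", style).group(1))
    # -> https://images.derstandard.at/cover.jpg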
8 changes: 4 additions & 4 deletions feeds/spiders/indiehackers_com.py
@@ -16,12 +16,12 @@ def parse(self, response):
 ".interview__link::attr(href), .interview__date::text"
 ).extract()
 self._logo = response.urljoin(
-response.css('link[rel="icon"][sizes="192x192"]::attr(href)').
-extract_first()
+response.css(
+'link[rel="icon"][sizes="192x192"]::attr(href)'
+).extract_first()
 )
 self._icon = response.urljoin(
-response.css('link[rel="icon"][sizes="16x16"]::attr(href)').
-extract_first()
+response.css('link[rel="icon"][sizes="16x16"]::attr(href)').extract_first()
 )
 for link, date in zip(interviews[::2], interviews[1::2]):
 yield scrapy.Request(
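
The selector changes above are purely cosmetic. The surrounding loop relies on
the ``zip(interviews[::2], interviews[1::2])`` idiom to pair up a flat list of
alternating link/date values extracted by the combined CSS selector; a tiny
standalone illustration with made-up values::

    # interviews holds alternating values: link, date, link, date, ...
    interviews = ["/interview/1", "Aug 18, 2018", "/interview/2", "Aug 20, 2018"]

    # interviews[::2] takes the links, interviews[1::2] the dates; zip pairs them.
    for link, date in zip(interviews[::2], interviews[1::2]):
        print(link, "->", date)
    # /interview/1 -> Aug 18, 2018
    # /interview/2 -> Aug 20, 2018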
4 changes: 1 addition & 3 deletions feeds/spiders/lwn_net.py
@@ -164,9 +164,7 @@ def _parse_article(self, response):
 ".MakeALink",
 "br",
 ]
-change_tags = {
-"div.BigQuote": "blockquote",
-}
+change_tags = {"div.BigQuote": "blockquote"}
 il = FeedEntryItemLoader(
 response=response,
 parent=response.meta["il"],
9 changes: 3 additions & 6 deletions feeds/spiders/profil_at.py
@@ -20,7 +20,6 @@ class ProfilAtSpider(FeedsXMLFeedSpider):
 _title = "PROFIL"
 _subtitle = "Österreichs unabhängiges Nachrichtenmagazin"
 _timezone = "Europe/Vienna"
-_max_articles = 20
 
 def start_requests(self):
 # Scrape this and last month so that the feed is not empty on the first day of a
@@ -36,11 +35,9 @@ def start_requests(self):
 )
 
 def parse_node(self, response, node):
-if self._max_articles > 0:
-self._max_articles -= 1
-url = node.xpath("rss:loc/text()").extract_first()
-updated = node.xpath("rss:lastmod/text()").extract_first()
-return scrapy.Request(url, self.parse_item, meta={"updated": updated})
+url = node.xpath("rss:loc/text()").extract_first()
+updated = node.xpath("rss:lastmod/text()").extract_first()
+return scrapy.Request(url, self.parse_item, meta={"updated": updated})
 
 def parse_item(self, response):
 remove_elems = [
