Merge pull request #135 from Lukas0907/generic-extraction
Add new generic spider.
Lukas0907 committed Jul 27, 2018
2 parents b7e639f + 4b4f72b commit 2b8a391
Showing 5 changed files with 116 additions and 3 deletions.
3 changes: 3 additions & 0 deletions README.rst
@@ -14,6 +14,9 @@ of your favorite websites in your feed reader (e.g. `Tiny Tiny RSS
<https://tt-rss.org>`_) even if this is not officially supported by the
website.

Furthermore, it can also enhance existing feeds by inlining the actual content
into the feed entries so they can be read without leaving the feed reader.

Feeds is based on Scrapy_, a framework for extracting data from websites, and
it's easy to add support for new websites. Just take a look at the existing
spiders in ``feeds/spiders`` and feel free to open a pull request!
32 changes: 32 additions & 0 deletions docs/spiders/generic.rst
@@ -0,0 +1,32 @@
.. _spider_generic:

Generic full-text extraction
----------------------------
The generic spider transforms existing Atom or RSS feeds, which usually
contain only a summary or the first few lines of the content, into
full-content feeds. It is similar to `Full-Text RSS`_ but uses a port of an
older version of Readability_ under the hood and currently doesn't support
site_config files. It works best for blog articles.
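
Under the hood, the extraction boils down to running Readability over the
article HTML. A minimal sketch of that step, assuming ``readability-lxml``
and ``requests`` are installed (the URL is just a placeholder):

.. code-block:: python

   import requests
   from readability.readability import Document

   # Placeholder URL; any blog article works.
   html = requests.get("https://www.example.com/blog/post").text

   doc = Document(html)
   # short_title() is the extracted article title; summary(html_partial=True)
   # is the cleaned article body without the surrounding <html>/<body> skeleton.
   print(doc.short_title())
   print(doc.summary(html_partial=True))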

Configuration
~~~~~~~~~~~~~
Add ``generic`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     generic

Add the feed URLs (Atom or RSS) to the config file.

.. code-block:: ini

   # List of URLs to RSS/Atom feeds to crawl, one per line.
   [generic]
   urls =
     https://www.example.com/feed.atom
     https://www.example.org/feed.xml
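
Feeds that don't parse cleanly are skipped with an error, so it can be worth
checking a URL with feedparser (which the spider itself uses) before adding
it. A quick sketch with a placeholder URL:

.. code-block:: python

   import feedparser

   # Placeholder URL; use the feed you want to add.
   d = feedparser.parse("https://www.example.com/feed.atom")

   print(d.feed.get("title"))  # feed title, if parsing succeeded
   print(len(d.entries))       # 0 entries means the spider would reject it
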
.. _Readability: https://github.com/mozilla/readability
.. _`Full-Text RSS`: http://fivefilters.org/content-only/
4 changes: 4 additions & 0 deletions feeds.cfg.dist
@@ -30,6 +30,10 @@ useragent = feeds (+https://github.com/nblock/feeds)
## Expire (remove) entries from cache after 14 days
# cache_expires = 14

#[generic]
## A list of URLs to RSS/Atom feeds.
# urls =

#[falter.at]
## falter.at has a paywall for certain articles.
## If you want to crawl paid articles, please provide abonr (subscription
72 changes: 72 additions & 0 deletions feeds/spiders/generic.py
@@ -0,0 +1,72 @@
import io
from urllib.parse import quote_plus as urlquote_plus, urlparse, urljoin

import feedparser
import scrapy
from readability.readability import Document, Unparseable

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider


class GenericSpider(FeedsSpider):
    name = "generic"

    def start_requests(self):
        self._sites = self.settings.get("FEEDS_SPIDER_GENERIC_URLS")
        if not self._sites:
            self.logger.error("Please specify url(s) in the config file!")
            return

        for url in self._sites.split():
            # Don't cache the feed itself, only the linked articles.
            yield scrapy.Request(url, meta={"dont_cache": True})

    def feed_headers(self):
        # A feed header is generated per configured URL in parse().
        return []

    def parse(self, response):
        feed = feedparser.parse(io.BytesIO(response.body))
        if "entries" not in feed or not feed["entries"]:
            self.logger.error("Feed {} contains no entries!".format(response.url))
            return
        feed_entries = feed["entries"]
        feed = feed["feed"]
        path = urlquote_plus(response.url)
        yield self.generate_feed_header(
            title=feed.get("title"),
            subtitle=feed.get("subtitle"),
            link=feed["link"],
            path=path,
            author_name=feed.get("author_detail", {}).get("name"),
            logo=feed.get("image", {}).get("href"),
        )
        # Scheme and netloc of the feed URL, e.g. "https://www.example.com".
        base_url = "://".join(urlparse(response.url)[:2])
        for entry in feed_entries:
            yield scrapy.Request(
                # Deals with protocol-relative URLs.
                urljoin(base_url, entry["link"]),
                self._parse_article,
                meta={"path": path, "feed_entry": entry, "base_url": base_url},
            )

    def _parse_article(self, response):
        doc = Document(response.text, url=response.url)
        feed_entry = response.meta["feed_entry"]
        il = FeedEntryItemLoader(base_url=response.meta["base_url"])
        il.add_value("title", doc.short_title() or feed_entry.get("title"))
        summary = feed_entry.get("summary")
        try:
            content = doc.summary(html_partial=True)
            if summary and len(summary) > len(content):
                # Something probably went wrong if the extracted content is
                # shorter than the summary.
                raise Unparseable
        except Unparseable:
            content = summary
        il.add_value("content_html", content)
        il.add_value("updated", feed_entry.get("updated", feed_entry.get("published")))
        il.add_value("author_name", feed_entry.get("author_detail", {}).get("name"))
        il.add_value("category", [t["term"] for t in feed_entry.get("tags", [])])
        il.add_value("path", response.meta["path"])
        il.add_value("link", response.url)
        yield il.load_item()
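
A detail that is easy to miss in parse(): entry links are joined against the
scheme and netloc of the feed URL, which is what makes protocol-relative
links (//host/path) absolute. A standalone sketch of that behaviour, with
placeholder URLs:

from urllib.parse import urljoin, urlparse

feed_url = "https://www.example.com/feed.atom"
base_url = "://".join(urlparse(feed_url)[:2])  # "https://www.example.com"

print(urljoin(base_url, "https://www.example.com/a"))  # absolute links pass through
print(urljoin(base_url, "//cdn.example.com/b"))        # -> "https://cdn.example.com/b"
print(urljoin(base_url, "/c"))                         # -> "https://www.example.com/c"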
8 changes: 5 additions & 3 deletions setup.py
@@ -10,12 +10,14 @@
     packages=find_packages(),
     include_package_data=True,
     install_requires=[
-        "bleach>=1.4.3",
         "Click>=6.6",
-        "dateparser>=0.5.1",
-        "python-dateutil>=2.7.3",
         "Scrapy>=1.1",
+        "bleach>=1.4.3",
+        "dateparser>=0.5.1",
+        "feedparser",
         "lxml>=3.5.0",
+        "python-dateutil>=2.7.3",
+        "readability-lxml>=0.7",
     ],
     extras_require={
         "docs": ["doc8", "restructuredtext_lint", "sphinx", "sphinx_rtd_theme"],
