Merge pull request #135 from Lukas0907/generic-extraction
Add new generic spider.
Lukas0907 committed Jul 27, 2018
2 parents b7e639f + 4b4f72b commit 2b8a391
Showing 5 changed files with 116 additions and 3 deletions.
3 changes: 3 additions & 0 deletions README.rst
@@ -14,6 +14,9 @@ of your favorite websites in your feed reader (e.g. `Tiny Tiny RSS
<https://tt-rss.org>`_) even if this is not officially supported by the
website.

Furthermore, it can also enhance existing feeds by inlining the actual content
into the feed entries so they can be read without leaving the feed reader.

Feeds is based on Scrapy_, a framework for extracting data from websites, and
it's easy to add support for new websites. Just take a look at the existing
spiders in ``feeds/spiders`` and feel free to open a pull request!
32 changes: 32 additions & 0 deletions docs/spiders/generic.rst
@@ -0,0 +1,32 @@
.. _spider_generic:

Generic full-text extraction
----------------------------
The generic spider transforms existing Atom or RSS feeds, which usually
contain only a summary or the first few lines of the content, into
full-content feeds. It is similar to `Full-Text RSS`_ but uses a port of an
older version of Readability_ under the hood and currently doesn't support
site_config files. It works best for blog articles.
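
Under the hood, the extraction boils down to running Readability over the
article HTML. A minimal sketch of that step, assuming ``readability-lxml``
and ``requests`` are installed (the URL is just a placeholder):

.. code-block:: python

   import requests
   from readability.readability import Document

   # Placeholder URL; any blog article works.
   html = requests.get("https://www.example.com/blog/post").text

   doc = Document(html)
   # short_title() is the extracted article title; summary(html_partial=True)
   # is the cleaned article body without the surrounding <html>/<body> skeleton.
   print(doc.short_title())
   print(doc.summary(html_partial=True))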

Configuration
~~~~~~~~~~~~~
Add ``generic`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     generic

Add the feed URLs (Atom or RSS) to the config file.

.. code-block:: ini

   # List of URLs to RSS/Atom feeds to crawl, one per line.
   [generic]
   urls =
     https://www.example.com/feed.atom
     https://www.example.org/feed.xml
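
Feeds that don't parse cleanly are skipped with an error, so it can be worth
checking a URL with feedparser (which the spider itself uses) before adding
it. A quick sketch with a placeholder URL:

.. code-block:: python

   import feedparser

   # Placeholder URL; use the feed you want to add.
   d = feedparser.parse("https://www.example.com/feed.atom")

   print(d.feed.get("title"))  # feed title, if parsing succeeded
   print(len(d.entries))       # 0 entries means the spider would reject it
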
.. _Readability: https://github.com/mozilla/readability
.. _`Full-Text RSS`: http://fivefilters.org/content-only/
4 changes: 4 additions & 0 deletions feeds.cfg.dist
@@ -30,6 +30,10 @@ useragent = feeds (+https://github.com/nblock/feeds)
## Expire (remove) entries from cache after 14 days
# cache_expires = 14

#[generic]
## A list of URLs to RSS/Atom feeds.
# urls =

#[falter.at]
## falter.at has a paywall for certain articles.
## If you want to crawl paid articles, please provide abonr (subscription
72 changes: 72 additions & 0 deletions feeds/spiders/generic.py
@@ -0,0 +1,72 @@
import io
from urllib.parse import quote_plus as urlquote_plus, urlparse, urljoin

import feedparser
import scrapy
from readability.readability import Document, Unparseable

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider


class GenericSpider(FeedsSpider):
    name = "generic"

    def start_requests(self):
        self._sites = self.settings.get("FEEDS_SPIDER_GENERIC_URLS")
        if not self._sites:
            self.logger.error("Please specify url(s) in the config file!")
            return

        for url in self._sites.split():
            # Don't cache the feed itself, only the linked articles.
            yield scrapy.Request(url, meta={"dont_cache": True})

    def feed_headers(self):
        # A feed header is generated per configured URL in parse().
        return []

    def parse(self, response):
        feed = feedparser.parse(io.BytesIO(response.body))
        if "entries" not in feed or not feed["entries"]:
            self.logger.error("Feed {} contains no entries!".format(response.url))
            return
        feed_entries = feed["entries"]
        feed = feed["feed"]
        path = urlquote_plus(response.url)
        yield self.generate_feed_header(
            title=feed.get("title"),
            subtitle=feed.get("subtitle"),
            link=feed["link"],
            path=path,
            author_name=feed.get("author_detail", {}).get("name"),
            logo=feed.get("image", {}).get("href"),
        )
        # Scheme and netloc of the feed URL, e.g. "https://www.example.com".
        base_url = "://".join(urlparse(response.url)[:2])
        for entry in feed_entries:
            yield scrapy.Request(
                # Deals with protocol-relative URLs.
                urljoin(base_url, entry["link"]),
                self._parse_article,
                meta={"path": path, "feed_entry": entry, "base_url": base_url},
            )

    def _parse_article(self, response):
        doc = Document(response.text, url=response.url)
        feed_entry = response.meta["feed_entry"]
        il = FeedEntryItemLoader(base_url=response.meta["base_url"])
        il.add_value("title", doc.short_title() or feed_entry.get("title"))
        summary = feed_entry.get("summary")
        try:
            content = doc.summary(html_partial=True)
            if summary and len(summary) > len(content):
                # Something probably went wrong if the extracted content is
                # shorter than the summary.
                raise Unparseable
        except Unparseable:
            content = summary
        il.add_value("content_html", content)
        il.add_value("updated", feed_entry.get("updated", feed_entry.get("published")))
        il.add_value("author_name", feed_entry.get("author_detail", {}).get("name"))
        il.add_value("category", [t["term"] for t in feed_entry.get("tags", [])])
        il.add_value("path", response.meta["path"])
        il.add_value("link", response.url)
        yield il.load_item()
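
A detail that is easy to miss in parse(): entry links are joined against the
scheme and netloc of the feed URL, which is what makes protocol-relative
links (//host/path) absolute. A standalone sketch of that behaviour, with
placeholder URLs:

from urllib.parse import urljoin, urlparse

feed_url = "https://www.example.com/feed.atom"
base_url = "://".join(urlparse(feed_url)[:2])  # "https://www.example.com"

print(urljoin(base_url, "https://www.example.com/a"))  # absolute links pass through
print(urljoin(base_url, "//cdn.example.com/b"))        # -> "https://cdn.example.com/b"
print(urljoin(base_url, "/c"))                         # -> "https://www.example.com/c"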
8 changes: 5 additions & 3 deletions setup.py
@@ -10,12 +10,14 @@
     packages=find_packages(),
     include_package_data=True,
     install_requires=[
-        "bleach>=1.4.3",
         "Click>=6.6",
-        "dateparser>=0.5.1",
-        "python-dateutil>=2.7.3",
         "Scrapy>=1.1",
+        "bleach>=1.4.3",
+        "dateparser>=0.5.1",
+        "feedparser",
         "lxml>=3.5.0",
+        "python-dateutil>=2.7.3",
+        "readability-lxml>=0.7",
     ],
     extras_require={
         "docs": ["doc8", "restructuredtext_lint", "sphinx", "sphinx_rtd_theme"],
