Merge branch 'generic'
Lukas0907 committed Aug 20, 2018
2 parents 1e7cd92 + a68d3c0 commit 4bee0ad
Showing 3 changed files with 39 additions and 17 deletions.
8 changes: 8 additions & 0 deletions docs/spiders/generic.rst
@@ -8,6 +8,12 @@ content feeds. It is similar to `Full-Text RSS`_ but uses a port of an older
version of Readability_ under the hood and currently doesn't support
site_config files. It works best for blog articles.

Some feeds already provide the full content, but in a tag that is not used by
your feed reader. For example, feeds created by WordPress usually have the
full content in the "encoded" tag. In such cases it is best to add the URL to
the ``fulltext_urls`` entry, which extracts the content directly from the feed
without Readability_.

Configuration
~~~~~~~~~~~~~
Add ``generic`` to the list of spiders:
@@ -27,6 +33,8 @@ Add the feed URLs (Atom or XML) to the config file.
urls =
https://www.example.com/feed.atom
https://www.example.org/feed.xml
fulltext_urls =
https://myblog.example.com/feed/
.. _Readability: https://github.com/mozilla/readability
.. _`Full-Text RSS`: http://fivefilters.org/content-only/
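
For illustration: feedparser, which the generic spider uses, exposes a
WordPress ``content:encoded`` tag as the entry's ``content`` attribute. A
minimal sketch (the feed URL is the placeholder from the example above)::

    import feedparser

    # Parse a feed whose items already carry the full article HTML.
    feed = feedparser.parse("https://myblog.example.com/feed/")
    entry = feed.entries[0]

    # <content:encoded> surfaces as entry.content, a list of content dicts;
    # the spider reads the same field as entry["content"][0]["value"].
    full_html = entry.content[0].value
    print(full_html[:200])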
3 changes: 3 additions & 0 deletions feeds.cfg.dist
@@ -33,6 +33,9 @@ useragent = feeds (+https://github.com/nblock/feeds)
#[generic]
## A list of URLs to RSS/Atom feeds.
# urls =
## A list of URLs to RSS/Atom feeds that provide the full content in the "encoded" or
## "content" tag.
# fulltext_urls =
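## For example (placeholder URL, as in the docs; the content is read straight
## from the feed, without Readability):
# fulltext_urls =
#     https://myblog.example.com/feed/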

#[falter.at]
## falter.at has a paywall for certain articles.
45 changes: 28 additions & 17 deletions feeds/spiders/generic.py
@@ -1,4 +1,5 @@
import io
import itertools
from urllib.parse import quote_plus as urlquote_plus, urlparse, urljoin

import feedparser
@@ -13,13 +14,17 @@ class GenericSpider(FeedsSpider):
name = "generic"

def start_requests(self):
self._sites = self.settings.get("FEEDS_SPIDER_GENERIC_URLS")
if not self._sites:
urls = self.settings.get("FEEDS_SPIDER_GENERIC_URLS") or ""
fulltext_urls = self.settings.get("FEEDS_SPIDER_GENERIC_FULLTEXT_URLS") or ""
if not urls and not fulltext_urls:
self.logger.error("Please specify url(s) in the config file!")
return

for url in self._sites.split():
yield scrapy.Request(url, meta={"dont_cache": True})
for url, fulltext in itertools.chain(
zip(urls.split(), itertools.repeat(False)),
zip(fulltext_urls.split(), itertools.repeat(True)),
):
yield scrapy.Request(url, meta={"dont_cache": True, "fulltext": fulltext})

def feed_headers(self):
return []
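
The chained iterator above pairs every plain feed URL with False and every
full-text URL with True. A standalone sketch of the pattern (the URLs are the
placeholders from the docs):

import itertools

urls = "https://www.example.com/feed.atom https://www.example.org/feed.xml"
fulltext_urls = "https://myblog.example.com/feed/"

# chain() concatenates the two zipped streams; repeat() supplies the flag.
for url, fulltext in itertools.chain(
    zip(urls.split(), itertools.repeat(False)),
    zip(fulltext_urls.split(), itertools.repeat(True)),
):
    print(url, fulltext)
# https://www.example.com/feed.atom False
# https://www.example.org/feed.xml False
# https://myblog.example.com/feed/ True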
@@ -42,17 +47,28 @@ def parse(self, response):
        )
        base_url = "://".join(urlparse(response.url)[:2])
        for entry in feed_entries:
-            yield scrapy.Request(
-                # Deals with protocol-relative URLs.
-                urljoin(base_url, entry["link"]),
-                self._parse_article,
-                meta={"path": path, "feed_entry": entry, "base_url": base_url},
-            )
+            # Deals with protocol-relative URLs.
+            link = urljoin(base_url, entry["link"])
+            il = FeedEntryItemLoader(base_url=base_url)
+            il.add_value("path", path)
+            il.add_value("updated", entry.get("updated") or entry.get("published"))
+            il.add_value("author_name", entry.get("author_detail", {}).get("name"))
+            il.add_value("link", link)
+            il.add_value("category", [t["term"] for t in entry.get("tags", [])])
+            if response.meta["fulltext"]:
+                il.add_value("title", entry["title"])
+                il.add_value("content_html", entry["content"][0]["value"])
+                yield il.load_item()
+            else:
+                # Content is not part of the feed, scrape it.
+                yield scrapy.Request(
+                    link, self._parse_article, meta={"feed_entry": entry, "il": il}
+                )

    def _parse_article(self, response):
-        doc = Document(response.text, url=response.url)
        feed_entry = response.meta["feed_entry"]
-        il = FeedEntryItemLoader(base_url=response.meta["base_url"])
+        il = FeedEntryItemLoader(parent=response.meta["il"])
+        doc = Document(response.text, url=response.url)
        il.add_value("title", doc.short_title() or feed_entry.get("title"))
        summary = feed_entry.get("summary")
        try:
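
The loader created in parse() travels to the callback in the request's meta
and is continued there via the loader's parent mechanism, so feed-level and
article-level values end up in one item. A minimal sketch of that mechanism
(the item and loader classes are simplified stand-ins, not the spider's real
ones):

from scrapy import Field, Item
from scrapy.loader import ItemLoader

class Entry(Item):
    title = Field()
    link = Field()

class EntryLoader(ItemLoader):
    default_item_class = Entry

# parse(): collect feed-level values.
feed_level = EntryLoader()
feed_level.add_value("link", "https://www.example.com/post")

# _parse_article(): a loader built with parent= shares the parent's
# collected values, so load_item() returns both fields in one item.
article_level = EntryLoader(parent=feed_level)
article_level.add_value("title", "Example post")
print(article_level.load_item())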
@@ -64,9 +80,4 @@ def _parse_article(self, response):
        except Unparseable:
            content = summary
        il.add_value("content_html", content)
-        il.add_value("updated", feed_entry.get("updated", feed_entry.get("published")))
-        il.add_value("author_name", feed_entry.get("author_detail", {}).get("name"))
-        il.add_value("category", [t["term"] for t in feed_entry.get("tags", [])])
-        il.add_value("path", response.meta["path"])
-        il.add_value("link", response.url)
        yield il.load_item()
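
_parse_article() leans on the Readability port for title and content
extraction. A minimal sketch, assuming the python-readability package, whose
Document API matches the calls above:

from readability import Document

html = (
    "<html><head><title>Example post</title></head>"
    "<body><p>Hello world, this is the body of an example post with "
    "enough text to extract.</p></body></html>"
)
doc = Document(html, url="https://www.example.com/post")
print(doc.short_title())               # the extracted title
print(doc.summary(html_partial=True))  # the readable article body as HTML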
