Skip to content

Commit

Permalink
Merge pull request #170 from PyFeeds/check-for-fulltext-feed
Browse files Browse the repository at this point in the history
Add a script to check feed URLs for full-text content
  • Loading branch information
nblock committed Sep 14, 2018
2 parents ea4d396 + cac9d4f commit 5813d43
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 1 deletion.
4 changes: 3 additions & 1 deletion docs/spiders/generic.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ Some feeds already provide the full content but in a tag that is not used by
your feed reader. E.g. feeds created by Wordpress usually have the full
content in the "encoded" tag. In such cases it's best to add the URL to the
``fulltext_urls`` entry which extracts the content directly from the feed
without Readability_.
without Readability_. There is a little helper script in
``scripts/check-for-fulltext-content`` to detect if a feed contains full-text
content.

Configuration
~~~~~~~~~~~~~
Expand Down
63 changes: 63 additions & 0 deletions scripts/check-for-fulltext-content
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/python3
# Check feed URLs for full-text content

import argparse
import logging
import sys

import requests

# Substrings searched for in the raw feed body; a feed containing any of them
# is reported as providing full-text content.
# "<content:encoded>" is the marker for Wordpress feeds.
MATCHER_STRINGS = ["<content:encoded>"]


def has_full_text_content(url, timeout=30):
    """Return True if the feed at *url* appears to contain full-text content.

    The feed is downloaded and its body is searched for each entry of
    ``MATCHER_STRINGS``; the first match wins.

    Args:
        url: The feed URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the script forever.

    Returns:
        True if any matcher string occurs in the response body. False if none
        matches, if the server answers with a non-OK status code, or if the
        request itself fails (connection error, timeout, invalid URL, ...).
    """
    logging.debug(f"Checking URL: '{url}'")
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable feed must not abort the check of the remaining URLs.
        logging.warning(f"Ignoring URL '{url}' after failed request: {exc}")
        return False

    if response.status_code != requests.codes.ok:
        logging.debug(f"Ignoring non-ok status code: {response.status_code}")
        return False

    # Decode the body once; ``response.text`` re-decodes on every access.
    body = response.text
    for string in MATCHER_STRINGS:
        if string in body:
            logging.debug(f"String '{string}' matches URL '{url}'")
            return True

    return False


def parse_args(argv=None):
    """Parse the command line arguments of the script.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with two attributes: ``verbosity`` (a log
        level name) and ``urls`` (an open text file to read URLs from).
    """
    parser = argparse.ArgumentParser(
        description="Check feed URLs for full-text content",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-v",
        "--verbosity",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default="WARNING",
        help="the log verbosity level",
    )
    parser.add_argument(
        "urls",
        # A positional argument is required unless nargs="?" is given, so
        # without it the default below could never take effect.
        nargs="?",
        type=argparse.FileType("r"),
        # FileType maps "-" to sys.stdin; as a string it also renders cleanly
        # in the help text produced by ArgumentDefaultsHelpFormatter.
        default="-",
        help="a file with URLs to check, '-' reads from standard input",
    )

    return parser.parse_args(argv)


def main():
    """Print every URL from the input file whose feed has full-text content.

    Reads the input line by line, ignores lines that are not HTTP(S) URLs and
    prints each URL for which ``has_full_text_content`` returns True.
    """
    args = parse_args()
    logging.basicConfig(level=args.verbosity)

    for line in args.urls:
        # Strip before filtering so that URLs with leading whitespace are
        # not silently skipped.
        url = line.strip()
        if not url.startswith("http"):
            continue
        if has_full_text_content(url):
            print(url)


# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 comments on commit 5813d43

Please sign in to comment.