diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
index 9b23c02..f7ba8b9 100644
--- a/.github/workflows/black.yml
+++ b/.github/workflows/black.yml
@@ -6,20 +6,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8]
+        python-version: [3.11]
     steps:
       # git checkout
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       # python setup
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
       # python cache
-      - uses: actions/cache@v1
+      - uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
diff --git a/.github/workflows/flake8.yml b/.github/workflows/flake8.yml
index 97c7dfa..62645d0 100644
--- a/.github/workflows/flake8.yml
+++ b/.github/workflows/flake8.yml
@@ -6,20 +6,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8]
+        python-version: [3.11]
     steps:
       # git checkout
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       # python setup
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
       # python cache
-      - uses: actions/cache@v1
+      - uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
diff --git a/.github/workflows/pyroma.yml b/.github/workflows/pyroma.yml
index bbad3c2..175b7d6 100644
--- a/.github/workflows/pyroma.yml
+++ b/.github/workflows/pyroma.yml
@@ -6,20 +6,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8]
+        python-version: [3.11]
    steps:
      # git checkout
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      # python setup
      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      # python cache
-      - uses: actions/cache@v1
+      - uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 13ca3e8..55d7c08 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -13,20 +13,20 @@ jobs:
       matrix:
         config:
           # [Python version, tox env]
-          - ["3.8", "py38-plone52"]
-          - ["3.8", "py38-plone60"]
+          # - ["3.8", "py38-plone52"]
+          # - ["3.8", "py38-plone60"]
           - ["3.9", "py39-plone60"]
           - ["3.10", "py310-plone60"]
           - ["3.11", "py311-plone60"]
     name: ${{ matrix.config[1] }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.config[0] }}
       - name: Pip cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/pip
diff --git a/.github/workflows/zpretty.yml b/.github/workflows/zpretty.yml
index 180f0e9..49a04e3 100644
--- a/.github/workflows/zpretty.yml
+++ b/.github/workflows/zpretty.yml
@@ -6,20 +6,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8]
+        python-version: [3.11]
    steps:
      # git checkout
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      # python setup
      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      # python cache
-      - uses: actions/cache@v1
+      - uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
diff --git a/.gitignore b/.gitignore
index b9fc443..8898a12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ report.html
 .vscode/
 .tox/
 reports/
+pyvenv.cfg
 # excludes
 !.coveragerc
 !.editorconfig
diff --git a/README.rst b/README.rst
index 58f3663..ccea33a 100644
--- a/README.rst
+++ b/README.rst
@@ -115,6 +115,18 @@ Install redturtle.rssservice by adding it to your buildout::

 and then running ``bin/buildout``

+Resilience
+==========
+
+To make the product more resilient against external feed disruptions, feed requests can be routed through an external proxy/cache service. The proxy is called by prepending its address to the feed URL::
+
+    http://proxyservice/http://feedurl
+
+The product includes a built-in proxy/cache implementation. After installing the product, you can start it with::
+
+    bin/rssmixer-proxy --port 8000 --ttl 1200
+
+Then set the environment variable ``RSSMIXER_PROXY`` to ``http://127.0.0.1:8000`` (adjusting it to the port used for the proxy).

 Contribute
 ==========
diff --git a/base.cfg b/base.cfg
index 1ebaedd..04e64e5 100644
--- a/base.cfg
+++ b/base.cfg
@@ -1,7 +1,7 @@
 [buildout]
 show-picked-versions = true
-extensions =
-    mr.developer
+#extensions =
+#    mr.developer

 parts =
     instance
@@ -14,7 +14,7 @@ parts =
 #    releaser
     i18ndude
     omelette
-    robot
+#    robot
     plone-helper-scripts
     vscode
@@ -65,6 +65,7 @@ eggs = coverage
 recipe = collective.recipe.template
 input = inline:
     #!/bin/bash
+    set -e
     export TZ=UTC
     ${buildout:directory}/bin/coverage run bin/test $*
     ${buildout:directory}/bin/coverage html
diff --git a/buildout.cfg b/buildout.cfg
index fe47885..049d99c 100644
--- a/buildout.cfg
+++ b/buildout.cfg
@@ -1,8 +1,6 @@
 [buildout]
-
 # use this extend one of the buildout configuration:
 extends = test_plone60.cfg

 [versions]
-setuptools =
diff --git a/requirements.txt b/requirements.txt
index 06390bd..0ba7f38 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1 @@
--c constraints.txt
-setuptools
-zc.buildout
+-r requirements_plone60.txt
diff --git a/requirements_plone60.txt b/requirements_plone60.txt
index 183df83..3ae6c36 100644
--- a/requirements_plone60.txt
+++ b/requirements_plone60.txt
@@ -1,13 +1 @@
-# Keep these the same as in base.cfg please.
-pip==22.2.2
-setuptools==65.3.0
-zc.buildout>=3.0.0rc3
-wheel==0.37.1
-
-# Windows specific down here (has to be installed here, fails in buildout)
-# Dependency of zope.sendmail:
-pywin32 ; platform_system == 'Windows'
-# SSL Certs on Windows, because Python is missing them otherwise:
-certifi ; platform_system == 'Windows'
-# Dependency of collective.recipe.omelette:
-ntfsutils ; platform_system == 'Windows' and python_version < '3.0'
+-r https://dist.plone.org/release/6.0-latest/requirements.txt
diff --git a/setup.cfg b/setup.cfg
index 6032ea3..04150a0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -6,13 +6,7 @@ ignore =
     .gitattributes

 [isort]
-# for details see
-# http://docs.plone.org/develop/styleguide/python.html#grouping-and-sorting
-force_alphabetical_sort = True
-force_single_line = True
-lines_after_imports = 2
-line_length = 200
-not_skip = __init__.py
+profile=plone

 [flake8]
 exclude = bootstrap.py,docs,*.egg.,omelette
@@ -21,4 +15,4 @@ max-line-length = 100000
 extend-ignore =
     E203,
     C901,
-    C101
\ No newline at end of file
+    C101
diff --git a/setup.py b/setup.py
index f551069..dd63769 100644
--- a/setup.py
+++ b/setup.py
@@ -76,6 +76,6 @@
     [z3c.autoinclude.plugin]
     target = plone
     [console_scripts]
-    update_locale = redturtle.rssservice.locales.update:update_locale
+    rssmixer-proxy = redturtle.rssservice.proxycacheserver:main
     """,
 )
diff --git a/src/redturtle/rssservice/proxycacheserver/__init__.py b/src/redturtle/rssservice/proxycacheserver/__init__.py
new file mode 100644
index 0000000..191b423
--- /dev/null
+++ b/src/redturtle/rssservice/proxycacheserver/__init__.py
@@ -0,0 +1 @@
+from .main import main  # NOQA
diff --git a/src/redturtle/rssservice/proxycacheserver/main.py b/src/redturtle/rssservice/proxycacheserver/main.py
new file mode 100644
index 0000000..c439bdd
--- /dev/null
+++ b/src/redturtle/rssservice/proxycacheserver/main.py
@@ -0,0 +1,279 @@
+"""
+This code implements a caching proxy server that stores and serves web content.
+
+Key Components:
+
+* Creates unique filenames for cached content using MD5 hashing
+* Stores the content together with its metadata (URL and headers) in one JSON file per URL
+* Refreshes cached content periodically in the background
+
+The Proxy Server:
+
+* Listens for incoming requests (be aware: protect the connection, or keep the server listening only on localhost)
+* Checks if the requested content is in the cache
+* If found, serves it from the cache
+* If not found, fetches it, saves it, then serves it
+
+Background Refresh:
+
+* Automatically updates cached content periodically
+* Runs in separate threads so it does not block the main server
+* Time between updates is configurable (TTL - Time To Live)
+
+Command Line Interface: uses the Click library to accept these parameters:
+
+* Host address (default: 127.0.0.1)
+* Port number (default: 8080)
+* Cache directory location (default: ./var/cache)
+* TTL for cache refresh (default: 3600 seconds)
+
+Usage example:
+
+```
+rssmixer-proxy --host 127.0.0.1 --port 8080 --cache-dir ./var/cache --ttl 3600
+```
+
+XXX: this is not a real HTTP/HTTPS proxy, because that would require acting as a man-in-the-middle; the target URL is simply appended to the proxy URL.
+
+Usage:
+
+```
+import requests
+
+RSSMIXER_PROXY = "http://127.0.0.1:8080"
+url = "https://abcnews.go.com/abcnews/usheadlines"
+res = requests.get(f"{RSSMIXER_PROXY}/{url}")
+```
+
+This is particularly useful for:
+
+* Reducing load on the original servers
+* Improving response times
+* Working with content even when the original source is temporarily unavailable
+* Saving bandwidth by not repeatedly downloading the same content
+"""
+
+import click
+import hashlib
+import http.server
+import json
+import logging
+import os
+import re
+import requests
+import socketserver
+import threading
+import time
+
+
+# This is not thread-safe, but that is acceptable for this use case.
+LAST_ACCESS_TIMES = {}
+MAX_TTL_IN_CACHE = 7 * 24 * 3600  # 1 week
+
+logger = logging.getLogger("rssmixer-proxy")
+logger.setLevel(logging.INFO)
+formatter = logging.Formatter(
+    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+)
+stream_handler = logging.StreamHandler()
+stream_handler.setFormatter(formatter)
+logger.addHandler(stream_handler)
+
+
+# Function to calculate the cache file path based on the URL
+def cache_path(url, cache_dir):
+    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
+    return os.path.join(cache_dir, f"{hash_url}.json")
+
+
+def load_json(cache_file):
+    try:
+        if os.path.exists(cache_file):
+            with open(cache_file, "r", encoding="utf-8") as f:
+                return json.load(f)
+    except Exception:
+        pass
+    # Always return a dict so callers can safely call .get() on the result
+    return {}
+
+
+def fetch_and_cache(url, cache_dir, client_headers=None, timeout=(1, 10)):
+    cache_file = cache_path(url, cache_dir)
+    try:
+        # Send the request to the server
+        if client_headers is None:
+            data = load_json(cache_file)
+            headers = data.get("request_headers", {})
+        else:
+            headers = client_headers
+        if "User-Agent" not in headers:
+            headers["User-Agent"] = "RSSMixerProxy/1.0"
+        if "Host" in headers:
+            del headers["Host"]
+        # Validate the URL
+        if not re.match(r"^https?:\/\/", url):
+            raise ValueError(f"Invalid URL path: {url}")
+        response = requests.get(url, headers=headers, timeout=timeout)
+        # Store the response in the cache
+        if response.status_code == 200:
+            cache_content = {
+                "url": url,
+                "request_headers": headers,
+                "response_headers": dict(response.headers),
+                "status_code": response.status_code,
+                "body": response.text,
+            }
+            # TODO: update the file only if it changed?
+            with open(cache_file, "w", encoding="utf-8") as f:
+                json.dump(cache_content, f, indent=2)
+            logger.info("Cached %s: %s in %s", response.status_code, url, cache_dir)
+        else:
+            logger.error("Failed to fetch %s: %s", url, response.status_code)
+            cache_content = {
+                "url": url,
+                "request_headers": headers,
+                "response_headers": dict(response.headers),
+                "status_code": response.status_code,
+                "body": response.text,
+            }
+            if not os.path.exists(cache_file):
+                with open(cache_file, "w", encoding="utf-8") as f:
+                    json.dump(cache_content, f, indent=2)
+                logger.info(
+                    "Cached error %s: %s in %s", response.status_code, url, cache_dir
+                )
+    except Exception as e:
+        logger.error("Error fetching %s: %s", url, e)
+        cache_content = {
+            "url": url,
+            "request_headers": headers,
+            "response_headers": {},
+            "status_code": 500,
+            "body": str(e),
+        }
+        if not os.path.exists(cache_file):
+            with open(cache_file, "w", encoding="utf-8") as f:
+                json.dump(cache_content, f, indent=2)
+            logger.info("Cached error: %s in %s", url, cache_dir)
+    return cache_content
+
+
+# Background thread to refresh the cache
+def refresh_cache(url, cache_dir, ttl):
+    logger.info(f"Refresh cache for {url} every {ttl} seconds")
+    while True:
+        time.sleep(ttl)
+        if url not in LAST_ACCESS_TIMES:
+            LAST_ACCESS_TIMES[url] = time.time()
+        else:
+            if LAST_ACCESS_TIMES[url] + MAX_TTL_IN_CACHE < time.time():
+                # The URL has not been requested for too long: drop it and stop refreshing
+                cache_file = cache_path(url, cache_dir)
+                if os.path.exists(cache_file):
+                    os.remove(cache_file)
+                    logger.warning("Remove %s from cached files", url)
+                return
+        logger.info("Refresh cache for %s", url)
+        fetch_and_cache(url, cache_dir)
+
+
+# Load the URLs to cache from the existing .json files in the cache directory
+def load_urls_from_cache(cache_dir):
+    urls = []
+    for file in os.listdir(cache_dir):
+        if file.endswith(".json"):
+            hash_file = os.path.join(cache_dir, file)
+            try:
+                # Extract the original URL from the cached file
+                with open(hash_file, "r", encoding="utf-8") as f:
+                    data = json.load(f)
+                url = data.get("url", "")
+                if url:
+                    logger.info("Load: %s from cache %s", url, hash_file)
+                    urls.append(url)
+            except Exception as e:
+                logger.info("Error reading cached file %s: %s", file, e)
+    return urls
+
+
+# HTTP proxy handler
+class CachingProxyHandler(http.server.BaseHTTPRequestHandler):
+    def __init__(self, *args, cache_dir=None, ttl=None, **kwargs):
+        self.cache_dir = cache_dir
+        self.ttl = ttl
+        super().__init__(*args, **kwargs)
+
+    def do_GET(self):
+        url = self.path.lstrip("/").replace("\n", "").replace("\r", "")
+        LAST_ACCESS_TIMES[url] = time.time()
+        cache_file = cache_path(url, self.cache_dir)
+
+        # Check if the page is already cached
+        if os.path.exists(cache_file):
+            logger.info("Serving from cache: %s", url)
+            with open(cache_file, "r", encoding="utf-8") as f:
+                cache_content = json.load(f)
+        else:
+            logger.info("Fetching and caching: %s", url)
+            client_headers = dict(self.headers)
+            cache_content = fetch_and_cache(url, self.cache_dir, client_headers)
+            threading.Thread(
+                target=refresh_cache, args=(url, self.cache_dir, self.ttl), daemon=True
+            ).start()
+
+        # Send the response
+        self.send_response(cache_content["status_code"])
+        for header, value in cache_content["response_headers"].items():
+            if header.lower() in ("set-cookie", "content-length"):
+                continue
+            if header.lower() in ("content-type", "cache-control"):
+                self.send_header(header, value)
+                continue
+            # logger.info("skip header", header, value)
+        self.send_header("Content-Length", len(cache_content["body"].encode("utf-8")))
+        self.end_headers()
+        self.wfile.write(cache_content["body"].encode("utf-8"))
+
+
+# Start the server
+def start_server(host, port, cache_dir, ttl):
+    def handler(*args, **kwargs):
+        return CachingProxyHandler(*args, cache_dir=cache_dir, ttl=ttl, **kwargs)
+
+    socketserver.TCPServer.allow_reuse_address = True
+    with socketserver.TCPServer((host, port), handler) as httpd:
+        try:
+            logger.info("Serving on http://%s:%s", host, port)
+            httpd.serve_forever()
+        finally:
+            logger.info("Closing connection")
+            httpd.shutdown()
+            # con.shutdown(socket.SHUT_RDWR)
+            # httpd.close()
+
+
+@click.command()
+@click.option("--host", default="127.0.0.1", help="IP address to run the server on.")
+@click.option("--port", default=8080, help="Port to run the server on.")
+@click.option(
+    "--cache-dir", default="./var/cache", help="Directory to store cached files."
+)
+@click.option("--ttl", default=3600, help="Time between cache refreshes, in seconds.")
+def main(host, port, cache_dir, ttl):
+    # Create the cache directory if it doesn't exist
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # Load URLs from the cache directory and start the refresh threads
+    cached_urls = load_urls_from_cache(cache_dir)
+    try:
+        for url in cached_urls:
+            threading.Thread(
+                target=refresh_cache, args=(url, cache_dir, ttl), daemon=True
+            ).start()
+
+        # Start the proxy server
+        start_server(host, port, cache_dir, ttl)
+    except KeyboardInterrupt:
+        logger.info("Server stopped.")
+    finally:
+        logger.info("Closing connection")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/redturtle/rssservice/rss_mixer.py b/src/redturtle/rssservice/rss_mixer.py
index b07febb..4d389e1 100644
--- a/src/redturtle/rssservice/rss_mixer.py
+++ b/src/redturtle/rssservice/rss_mixer.py
@@ -22,6 +22,7 @@
 import logging
 import requests

+
 logger = logging.getLogger(__name__)
@@ -34,6 +35,7 @@
 REQUESTS_TIMEOUT = int(environ.get("RSS_SERVICE_TIMEOUT", "5")) or 5
 REQUESTS_USER_AGENT = environ.get("RSS_USER_AGENT")
+RSSMIXER_HTTP_PROXY = environ.get("RSSMIXER_PROXY", "")

 class RSSMixerService(Service):
@@ -232,6 +234,8 @@ def _getFeedFromUrl(self, url):
         if REQUESTS_USER_AGENT:
             headers["User-Agent"] = REQUESTS_USER_AGENT
         try:
+            if RSSMIXER_HTTP_PROXY:
+                url = f"{RSSMIXER_HTTP_PROXY}/{url}"
             response = requests.get(
                 url,
                 headers=headers,
diff --git a/src/redturtle/rssservice/testing.py b/src/redturtle/rssservice/testing.py
index 495bf82..f300aba 100644
--- a/src/redturtle/rssservice/testing.py
+++ b/src/redturtle/rssservice/testing.py
@@ -8,8 +8,8 @@
 from plone.restapi.testing import PloneRestApiDXLayer
 from plone.testing import z2

-import redturtle.rssservice
 import plone.restapi
+import redturtle.rssservice


 class RedTurtleRSSServiceLayer(PloneSandboxLayer):
diff --git a/src/redturtle/rssservice/tests/test_rss_mixer.py b/src/redturtle/rssservice/tests/test_rss_mixer.py
index a216e99..6c13609 100644
--- a/src/redturtle/rssservice/tests/test_rss_mixer.py
+++ b/src/redturtle/rssservice/tests/test_rss_mixer.py
@@ -6,15 +6,14 @@
 from plone.app.testing import TEST_USER_ID
 from plone.restapi.testing import RelativeSession
 from redturtle.rssservice.rss_mixer import FEED_DATA
-from redturtle.rssservice.testing import (
-    REDTURTLE_RSSSERVICE_API_FUNCTIONAL_TESTING,
-)
+from redturtle.rssservice.testing import REDTURTLE_RSSSERVICE_API_FUNCTIONAL_TESTING
 from requests.exceptions import Timeout
 from transaction import commit
 from unittest import mock

 import unittest

+
 EXAMPLE_FEED_FOO = """
diff --git a/test_plone60.cfg b/test_plone60.cfg
index e8e7ab3..2ea07d6 100644
--- a/test_plone60.cfg
+++ b/test_plone60.cfg
@@ -8,14 +8,24 @@ extends =
 update-versions-file = test_plone60.cfg

 [versions]
-createcoverage = 1.5
-watchdog = 2.1.6
+#createcoverage = 1.5
+#watchdog = 2.1.6

 # Added by buildout at 2023-03-08 12:46:09.938653
-coverage = 7.1.0
-i18ndude = 5.5.0
-requests-mock = 1.10.0
+#coverage = 7.1.0
+#i18ndude = 5.5.0
+#requests-mock = 1.10.0

 # Required by:
 # redturtle.rssservice==2.0.1.dev0
-z3c.jbot = 1.1.1
+#z3c.jbot = 1.1.1
+
+# Added by buildout at 2025-01-18 18:05:02.912359
+coverage = 7.6.10
+createcoverage = 1.5
+i18ndude = 6.2.1
+requests-mock = 1.12.1
+
+# Required by:
+# redturtle.rssservice==2.2.2.dev0
+z3c.jbot = 2.0
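For reference, a minimal client-side sketch of the proxy contract described above: the proxy address is simply prepended to the feed URL, which is the same rewriting rule ``_getFeedFromUrl`` applies when ``RSSMIXER_PROXY`` is set. The proxy address and feed URL below are illustrative placeholders, and the sketch assumes the bundled ``rssmixer-proxy`` is already running on that port.

```
import os

import requests

# Hypothetical values for illustration: the bundled proxy was started with
#   bin/rssmixer-proxy --port 8000 --ttl 1200
RSSMIXER_PROXY = os.environ.get("RSSMIXER_PROXY", "http://127.0.0.1:8000")
feed_url = "https://www.example.com/feed.rss"  # placeholder feed URL

# Prepend the proxy address to the feed URL, as rss_mixer.py does.
if RSSMIXER_PROXY:
    feed_url = f"{RSSMIXER_PROXY}/{feed_url}"

response = requests.get(feed_url, timeout=5)
print(response.status_code, len(response.text))
```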