From f66020be3881e95a3964eedfd5ac88764242d77e Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 16 May 2024 20:36:10 +0200 Subject: [PATCH 1/2] CPLHTTPFetch(): add a RETRY_CODES option (GDAL_HTTP_RETRY_CODES config option) Fixes #9941 ``` - .. config:: GDAL_HTTP_RETRY_CODES :choices: ALL or comma-separated list of codes :since: 3.10 Specify which HTTP error codes should trigger a retry attempt. Valid values are ``ALL`` or a comma-separated list of HTTP codes. By default, 429, 500, 502, 503 or 504 HTTP errors are considered, as well as other situations with a particular HTTP or Curl error message. ``` --- autotest/gcore/vsicurl.py | 91 ++++++++ doc/source/user/configoptions.rst | 15 +- doc/source/user/virtual_file_systems.rst | 1 + port/cpl_http.cpp | 269 +++++++++++++++-------- port/cpl_http.h | 3 +- port/cpl_vsil_curl.cpp | 46 ++-- port/cpl_vsil_curl_class.h | 1 + 7 files changed, 319 insertions(+), 107 deletions(-) diff --git a/autotest/gcore/vsicurl.py b/autotest/gcore/vsicurl.py index 1b425be6935d..00862d010a32 100755 --- a/autotest/gcore/vsicurl.py +++ b/autotest/gcore/vsicurl.py @@ -613,6 +613,97 @@ def test_vsicurl_test_retry(server): ############################################################################### +def test_vsicurl_retry_codes_ALL(server): + + gdal.VSICurlClearCache() + + handler = webserver.SequentialHandler() + handler.add("GET", "/test_retry/", 404) + handler.add("HEAD", "/test_retry/test.txt", 200, {"Content-Length": "3"}) + handler.add( + "GET", "/test_retry/test.txt", 400 + ) # non retriable by default, but here allowed because of retry_codes=ALL + handler.add("GET", "/test_retry/test.txt", 200, {}, "foo") + with webserver.install_http_handler(handler): + f = gdal.VSIFOpenL( + "/vsicurl?max_retry=1&retry_delay=0.01&retry_codes=ALL&url=http://localhost:%d/test_retry/test.txt" + % server.port, + "rb", + ) + assert f is not None + gdal.ErrorReset() + with gdal.quiet_errors(): + data = gdal.VSIFReadL(1, 3, f).decode("ascii") + error_msg = gdal.GetLastErrorMsg() + gdal.VSIFCloseL(f) + assert data == "foo" + assert "400" in error_msg + + +############################################################################### + + +def test_vsicurl_retry_codes_enumerated(server): + + gdal.VSICurlClearCache() + + handler = webserver.SequentialHandler() + handler.add("GET", "/test_retry/", 404) + handler.add("HEAD", "/test_retry/test.txt", 200, {"Content-Length": "3"}) + handler.add( + "GET", "/test_retry/test.txt", 400 + ) # non retriable by default, but here allowed because of retry_codes=ALL + handler.add("GET", "/test_retry/test.txt", 200, {}, "foo") + with webserver.install_http_handler(handler), gdal.config_option( + "GDAL_HTTP_RETRY_CODES", "400" + ): + f = gdal.VSIFOpenL( + "/vsicurl?max_retry=1&retry_delay=0.01&url=http://localhost:%d/test_retry/test.txt" + % server.port, + "rb", + ) + assert f is not None + gdal.ErrorReset() + with gdal.quiet_errors(): + data = gdal.VSIFReadL(1, 3, f).decode("ascii") + error_msg = gdal.GetLastErrorMsg() + gdal.VSIFCloseL(f) + assert data == "foo" + assert "400" in error_msg + + +############################################################################### + + +def test_vsicurl_retry_codes_no_match(server): + + gdal.VSICurlClearCache() + + handler = webserver.SequentialHandler() + handler.add("GET", "/test_retry/", 404) + handler.add("HEAD", "/test_retry/test.txt", 200, {"Content-Length": "3"}) + handler.add( + "GET", "/test_retry/test.txt", 400 + ) # non retriable by default, and not listed in GDAL_HTTP_RETRY_CODES + with webserver.install_http_handler(handler), gdal.config_option( + "GDAL_HTTP_RETRY_CODES", "409" + ): + f = gdal.VSIFOpenL( + "/vsicurl?max_retry=1&retry_delay=0.01&url=http://localhost:%d/test_retry/test.txt" + % server.port, + "rb", + ) + assert f is not None + gdal.ErrorReset() + with gdal.quiet_errors(): + data = gdal.VSIFReadL(1, 3, f).decode("ascii") + gdal.VSIFCloseL(f) + assert len(data) == 0 + + +############################################################################### + + def test_vsicurl_test_fallback_from_head_to_get(server): gdal.VSICurlClearCache() diff --git a/doc/source/user/configoptions.rst b/doc/source/user/configoptions.rst index 32e933fbb9c6..a0056df591ed 100644 --- a/doc/source/user/configoptions.rst +++ b/doc/source/user/configoptions.rst @@ -744,15 +744,28 @@ Networking options - .. config:: GDAL_HTTP_MAX_RETRY :since: 2.3 + :default: 0 - Set the number of HTTP attempts in case of HTTP errors 429, 502, 503, or 504. + Set the number of HTTP attempts, when a retry is allowed. + (cf :config:`GDAL_HTTP_RETRY_CODES` for conditions where a retry is attempted.) + The default value is 0, meaning no retry. - .. config:: GDAL_HTTP_RETRY_DELAY :choices: :since: 2.3 + :default: 30 Set the delay between HTTP attempts. +- .. config:: GDAL_HTTP_RETRY_CODES + :choices: ALL or comma-separated list of codes + :since: 3.10 + + Specify which HTTP error codes should trigger a retry attempt. + Valid values are ``ALL`` or a comma-separated list of HTTP codes. + By default, 429, 500, 502, 503 or 504 HTTP errors are considered, as + well as other situations with a particular HTTP or Curl error message. + - .. config:: GDAL_HTTP_TCP_KEEPALIVE :choices: YES, NO :default: NO diff --git a/doc/source/user/virtual_file_systems.rst b/doc/source/user/virtual_file_systems.rst index 9929057ea0d0..b4aea9243f10 100644 --- a/doc/source/user/virtual_file_systems.rst +++ b/doc/source/user/virtual_file_systems.rst @@ -379,6 +379,7 @@ Starting with GDAL 2.3, options can be passed in the filename with the following - use_head=yes/no: whether the HTTP HEAD request can be emitted. Default to YES. Setting this option overrides the behavior of the :config:`CPL_VSIL_CURL_USE_HEAD` configuration option. - max_retry=number: default to 0. Setting this option overrides the behavior of the :config:`GDAL_HTTP_MAX_RETRY` configuration option. - retry_delay=number_in_seconds: default to 30. Setting this option overrides the behavior of the :config:`GDAL_HTTP_RETRY_DELAY` configuration option. +- retry_codes=``ALL`` or comma-separated list of HTTP error codes. Setting this option overrides the behavior of the :config:`GDAL_HTTP_RETRY_CODES` configuration option. (GDAL >= 3.10) - list_dir=yes/no: whether an attempt to read the file list of the directory where the file is located should be done. Default to YES. - useragent=value: HTTP UserAgent header - referer=value: HTTP Referer header diff --git a/port/cpl_http.cpp b/port/cpl_http.cpp index 2d7f3bb10c7f..2a9e366cec69 100644 --- a/port/cpl_http.cpp +++ b/port/cpl_http.cpp @@ -529,6 +529,7 @@ constexpr TupleEnvVarOptionName asAssocEnvVarOptionName[] = { {"GDAL_HTTP_NETRC_FILE", "NETRC_FILE"}, {"GDAL_HTTP_MAX_RETRY", "MAX_RETRY"}, {"GDAL_HTTP_RETRY_DELAY", "RETRY_DELAY"}, + {"GDAL_HTTP_RETRY_CODES", "RETRY_CODES"}, {"GDAL_CURL_CA_BUNDLE", "CAINFO"}, {"CURL_CA_BUNDLE", "CAINFO"}, {"SSL_CERT_FILE", "CAINFO"}, @@ -576,18 +577,45 @@ char **CPLHTTPGetOptionsFromEnv(const char *pszFilename) /* CPLHTTPGetNewRetryDelay() */ /************************************************************************/ +/** Return the new retry delay. + * + * This takes into account the HTTP response code, the previous delay, the + * HTTP payload error message, the Curl error message and a potential list of + * retriable HTTP codes. + * + * @param response_code HTTP response code (e.g. 400) + * @param dfOldDelay Previous delay (nominally in second) + * @param pszErrBuf HTTP response body of the failed request (may be NULL) + * @param pszCurlError Curl error as returned by CURLOPT_ERRORBUFFER (may be NULL) + * @param pszRetriableCodes nullptr to limit to the default hard-coded scenarios, + * "ALL" to ask to retry for all non-200 codes, or a comma-separated list of + * HTTP codes (e.g. "400,500") that are accepted for retry. + * @return the new delay, or 0 if no retry should be attempted. + */ double CPLHTTPGetNewRetryDelay(int response_code, double dfOldDelay, - const char *pszErrBuf, const char *pszCurlError) + const char *pszErrBuf, const char *pszCurlError, + const char *pszRetriableCodes) { - if (response_code == 429 || response_code == 500 || - (response_code >= 502 && response_code <= 504) || - // S3 sends some client timeout errors as 400 Client Error - (response_code == 400 && pszErrBuf && - strstr(pszErrBuf, "RequestTimeout")) || - (pszCurlError && (strstr(pszCurlError, "Connection timed out") || - strstr(pszCurlError, "Operation timed out") || - strstr(pszCurlError, "Connection reset by peer") || - strstr(pszCurlError, "Connection was reset")))) + bool bRetry = false; + if (pszRetriableCodes && pszRetriableCodes[0]) + { + bRetry = EQUAL(pszRetriableCodes, "ALL") || + strstr(pszRetriableCodes, CPLSPrintf("%d", response_code)); + } + else if (response_code == 429 || response_code == 500 || + (response_code >= 502 && response_code <= 504) || + // S3 sends some client timeout errors as 400 Client Error + (response_code == 400 && pszErrBuf && + strstr(pszErrBuf, "RequestTimeout")) || + (pszCurlError && + (strstr(pszCurlError, "Connection timed out") || + strstr(pszCurlError, "Operation timed out") || + strstr(pszCurlError, "Connection reset by peer") || + strstr(pszCurlError, "Connection was reset")))) + { + bRetry = true; + } + if (bRetry) { // 'Operation timed out': seen during some long running operation 'hang' // no error but no response from server and we are in the cURL loop @@ -910,69 +938,133 @@ int CPLHTTPPopFetchCallback(void) /** * \brief Fetch a document from an url and return in a string. * - * @param pszURL valid URL recognized by underlying download library (libcurl) - * @param papszOptions option list as a NULL-terminated array of strings. May be - * NULL. The following options are handled :