diff --git a/cpp/src/arrow/io/caching.h b/cpp/src/arrow/io/caching.h
index 99b4416505077..e2b911fafdbbc 100644
--- a/cpp/src/arrow/io/caching.h
+++ b/cpp/src/arrow/io/caching.h
@@ -61,9 +61,10 @@ struct ARROW_EXPORT CacheOptions {
   /// \brief Construct CacheOptions from network storage metrics (e.g. S3).
   ///
   /// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in
-  /// milliseconds, also called call setup latency of a new S3 request.
+  /// milliseconds, also called call setup latency of a new read request.
   /// The value is a positive integer.
-  /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec.
+  /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec
+  /// (per connection).
   /// The value is a positive integer.
   /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction
   /// (per connection) to maximize the net data load.
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 02a3db89a19c7..d93477915743c 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -2050,6 +2050,36 @@ cdef class CacheOptions(_Weakrefable):
         except TypeError:
             return False
 
+    @staticmethod
+    def from_network_metrics(time_to_first_byte_millis, transfer_bandwidth_mib_per_sec,
+                             ideal_bandwidth_utilization_frac, max_ideal_request_size_mib):
+        """
+        Create suitable CacheOptions based on provided network metrics.
+
+        Typically this will be used with object storage solutions like Amazon S3,
+        Google Cloud Storage and Azure Blob Storage.
+
+        Parameters
+        ----------
+        time_to_first_byte_millis : int
+            Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call
+            setup latency of a new read request. The value is a positive integer.
+        transfer_bandwidth_mib_per_sec : int
+            Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a
+            positive integer.
+        ideal_bandwidth_utilization_frac : float
+            Transfer bandwidth utilization fraction (per connection) to maximize the
+            net data load. The value is a positive float less than 1.
+        max_ideal_request_size_mib : int
+            The maximum single data request size (in MiB) to maximize the net data load.
+
+        Returns
+        -------
+        CacheOptions
+        """
+        return CacheOptions.wrap(CCacheOptions.MakeFromNetworkMetrics(
+            time_to_first_byte_millis, transfer_bandwidth_mib_per_sec,
+            ideal_bandwidth_utilization_frac, max_ideal_request_size_mib))
+
     @staticmethod
     @binding(True)  # Required for Cython < 3
     def _reconstruct(kwargs):
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 2baa2a11928ee..6475c293ae941 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1354,6 +1354,12 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil:
         int64_t prefetch_limit
         c_bool Equals "operator==" (CCacheOptions other)
 
+        @staticmethod
+        CCacheOptions MakeFromNetworkMetrics(int64_t time_to_first_byte_millis,
+                                             int64_t transfer_bandwidth_mib_per_sec,
+                                             double ideal_bandwidth_utilization_frac,
+                                             int64_t max_ideal_request_size_mib)
+
         @staticmethod
         CCacheOptions Defaults()
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 5646bf908dda2..24e93e323feb6 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -26,7 +26,6 @@
 import threading
 import time
 from shutil import copytree
-
 from urllib.parse import quote
 
 import numpy as np
@@ -903,6 +902,10 @@ def test_cache_options():
     opts4 = ds.CacheOptions(hole_size_limit=4096, range_size_limit=8192, lazy=True)
     opts5 = ds.CacheOptions(hole_size_limit=4096, range_size_limit=8192, lazy=True,
                             prefetch_limit=5)
+    opts6 = ds.CacheOptions.from_network_metrics(time_to_first_byte_millis=100,
+                                                 transfer_bandwidth_mib_per_sec=200,
+                                                 ideal_bandwidth_utilization_frac=0.9,
+                                                 max_ideal_request_size_mib=64)
 
     assert opts1.hole_size_limit == 8192
     assert opts1.range_size_limit == 32 * 1024 * 1024
@@ -929,10 +932,14 @@
     assert opts5.lazy is True
     assert opts5.prefetch_limit == 5
 
+    assert opts6.lazy is False
+
     assert opts1 == opts1
     assert opts1 != opts2
     assert opts2 != opts3
     assert opts3 != opts4
+    assert opts4 != opts5
+    assert opts6 != opts1
 
 
 def test_cache_options_pickling(pickle_module):
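
For reviewers, a minimal usage sketch of the new binding. The metric values mirror those in the new test case; the attributes read at the end (`hole_size_limit`, `range_size_limit`, `lazy`) are the existing `CacheOptions` fields that this constructor derives from the metrics.

```python
import pyarrow.dataset as ds

# Illustrative metrics for an object store, matching the test:
# ~100 ms time-to-first-byte, ~200 MiB/s per-connection bandwidth,
# targeting 90% utilization with single requests capped at 64 MiB.
opts = ds.CacheOptions.from_network_metrics(
    time_to_first_byte_millis=100,
    transfer_bandwidth_mib_per_sec=200,
    ideal_bandwidth_utilization_frac=0.9,
    max_ideal_request_size_mib=64,
)

# The result is an ordinary eager CacheOptions (the test asserts
# opts.lazy is False) with coalescing limits derived from the metrics.
assert opts.lazy is False
print(opts.hole_size_limit, opts.range_size_limit)
```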