Skip to content

Commit

Permalink
pybind MakeFromNetworkMetrics
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom-Newton committed Dec 11, 2023
1 parent 8b6d430 commit 1e77bac
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 3 deletions.
5 changes: 3 additions & 2 deletions cpp/src/arrow/io/caching.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,10 @@ struct ARROW_EXPORT CacheOptions {
/// \brief Construct CacheOptions from network storage metrics (e.g. S3).
///
/// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in
/// milliseconds, also called call setup latency of a new S3 request.
/// milliseconds, also called call setup latency of a new read request.
/// The value is a positive integer.
/// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec.
/// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec
/// (per connection).
/// The value is a positive integer.
/// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction
/// (per connection) to maximize the net data load.
Expand Down
30 changes: 30 additions & 0 deletions python/pyarrow/_dataset.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2050,6 +2050,36 @@ cdef class CacheOptions(_Weakrefable):
except TypeError:
return False

@staticmethod
def from_network_metrics(time_to_first_byte_millis, transfer_bandwidth_mib_per_sec,
                         ideal_bandwidth_utilization_frac, max_ideal_request_size_mib):
    """
    Create suitable CacheOptions based on provided network metrics.

    Typically this will be used with object storage solutions like Amazon S3,
    Google Cloud Storage and Azure Blob Storage.

    Parameters
    ----------
    time_to_first_byte_millis : int
        Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call
        setup latency of a new read request. The value is a positive integer.
    transfer_bandwidth_mib_per_sec : int
        Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a
        positive integer.
    ideal_bandwidth_utilization_frac : float
        Transfer bandwidth utilization fraction (per connection) to maximize the
        net data load. The value is a positive float less than 1.
    max_ideal_request_size_mib : int
        The maximum single data request size (in MiB) to maximize the net data
        load.

    Returns
    -------
    CacheOptions
    """
    # Delegate to the C++ heuristic; wrap() converts the returned
    # CCacheOptions struct into a Python CacheOptions instance.
    return CacheOptions.wrap(CCacheOptions.MakeFromNetworkMetrics(
        time_to_first_byte_millis, transfer_bandwidth_mib_per_sec,
        ideal_bandwidth_utilization_frac, max_ideal_request_size_mib))

@staticmethod
@binding(True) # Required for Cython < 3
def _reconstruct(kwargs):
Expand Down
6 changes: 6 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -1354,6 +1354,12 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil:
int64_t prefetch_limit
c_bool Equals "operator==" (CCacheOptions other)

@staticmethod
# Heuristically derive CacheOptions from network storage metrics
# (mirrors CacheOptions::MakeFromNetworkMetrics in arrow/io/caching.h).
CCacheOptions MakeFromNetworkMetrics(int64_t time_to_first_byte_millis,
                                     int64_t transfer_bandwidth_mib_per_sec,
                                     double ideal_bandwidth_utilization_frac,
                                     int64_t max_ideal_request_size_mib)

@staticmethod
# Default cache options (see CacheOptions::Defaults in arrow/io/caching.h).
CCacheOptions Defaults()

Expand Down
9 changes: 8 additions & 1 deletion python/pyarrow/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import threading
import time
from shutil import copytree

from urllib.parse import quote

import numpy as np
Expand Down Expand Up @@ -903,6 +902,10 @@ def test_cache_options():
opts4 = ds.CacheOptions(hole_size_limit=4096, range_size_limit=8192, lazy=True)
opts5 = ds.CacheOptions(hole_size_limit=4096,
range_size_limit=8192, lazy=True, prefetch_limit=5)
opts6 = ds.CacheOptions.from_network_metrics(time_to_first_byte_millis=100,
transfer_bandwidth_mib_per_sec=200,
ideal_bandwidth_utilization_frac=0.9,
max_ideal_request_size_mib=64)

assert opts1.hole_size_limit == 8192
assert opts1.range_size_limit == 32 * 1024 * 1024
Expand All @@ -929,10 +932,14 @@ def test_cache_options():
assert opts5.lazy is True
assert opts5.prefetch_limit == 5

assert opts6.lazy is False

assert opts1 == opts1
assert opts1 != opts2
assert opts2 != opts3
assert opts3 != opts4
assert opts4 != opts5
assert opts6 != opts1


def test_cache_options_pickling(pickle_module):
Expand Down

0 comments on commit 1e77bac

Please sign in to comment.