Skip to content

Commit

Permalink
[Breaking] Change in behavior of /status by introduction of component…
Browse files Browse the repository at this point in the history
… based health

Introduce a component-based health check system. Each type of component
should be able to opt into registering handlers that implement some mechanical
health checks. Health checks in this context exercise functionality that is
expected to work but that, at runtime, is close to a no-op in terms of its
influence on the underlying storage / RPC systems.

Opting in to the system requires a component to define a `HealthStatusIndicator`
and implement the `check_health()` function. Registration should happen
automatically once that implementation exists and the component's implementation
calls the `Store.register_health()` function. At the moment only `Store`-based
components can register, and a single health check is defined for `FilesystemStore`.

A global parameter `default_digest_size_health_check` has been introduced for
configuring the size of the statically seeded random bytes used to fill the data
payload. The default value is 1 MiB (1024*1024).

The `/status` endpoint has been updated to return the resulting instances of
`HealthStatus` serialized as strings; smart serialization of such objects is not
implemented at this time.
  • Loading branch information
adam-singer committed Feb 15, 2024
1 parent 745b0d6 commit 9caec5f
Show file tree
Hide file tree
Showing 30 changed files with 695 additions and 22 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions nativelink-config/src/cas_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,15 @@ pub struct GlobalConfig {
///
/// Default: ConfigDigestHashFunction::sha256
pub default_digest_hash_function: Option<ConfigDigestHashFunction>,

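/// Default digest size to use for health checks when running
/// diagnostic checks. Health checks are expected to use this
/// size when filling the buffer from which the digest is
/// created.
///
/// Default: 1024*1024 (1MiB)
///
/// NOTE(review): `#[serde(default)]` on a `usize` yields 0 when the
/// key is omitted, so the 1MiB default is presumably applied by the
/// code that consumes this value — confirm.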
#[serde(default, deserialize_with = "convert_numeric_with_shellexpand")]
pub default_digest_size_health_check: usize,
}

#[derive(Deserialize, Debug)]
Expand Down
2 changes: 2 additions & 0 deletions nativelink-service/tests/ac_server_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ async fn make_store_manager() -> Result<Arc<StoreManager>, Error> {
&nativelink_config::stores::StoreConfig::memory(nativelink_config::stores::MemoryStore::default()),
&store_manager,
Some(&mut <Registry>::default()),
None,
)
.await?,
);
Expand All @@ -64,6 +65,7 @@ async fn make_store_manager() -> Result<Arc<StoreManager>, Error> {
&nativelink_config::stores::StoreConfig::memory(nativelink_config::stores::MemoryStore::default()),
&store_manager,
Some(&mut <Registry>::default()),
None,
)
.await?,
);
Expand Down
1 change: 1 addition & 0 deletions nativelink-service/tests/bytestream_server_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ async fn make_store_manager() -> Result<Arc<StoreManager>, Error> {
&nativelink_config::stores::StoreConfig::memory(nativelink_config::stores::MemoryStore::default()),
&store_manager,
Some(&mut <Registry>::default()),
None,
)
.await?,
);
Expand Down
1 change: 1 addition & 0 deletions nativelink-service/tests/cas_server_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ async fn make_store_manager() -> Result<Arc<StoreManager>, Error> {
&nativelink_config::stores::StoreConfig::memory(nativelink_config::stores::MemoryStore::default()),
&store_manager,
Some(&mut <Registry>::default()),
None,
)
.await?,
);
Expand Down
1 change: 1 addition & 0 deletions nativelink-store/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ rust_library(
"@crates//:serde",
"@crates//:sha2",
"@crates//:shellexpand",
"@crates//:tempfile",
"@crates//:tokio",
"@crates//:tokio-stream",
"@crates//:tokio-util",
Expand Down
1 change: 1 addition & 0 deletions nativelink-store/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ rand = "0.8.5"
serde = "1.0.193"
sha2 = "0.10.8"
shellexpand = "3.1.0"
tempfile = "3.9.0"
tokio = { version = "1.35.1" }
tokio-stream = { version = "0.1.14", features = ["fs"] }
tokio-util = { version = "0.7.10" }
Expand Down
3 changes: 3 additions & 0 deletions nativelink-store/src/completeness_checking_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use nativelink_proto::build::bazel::remote::execution::v2::{
};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use parking_lot::Mutex;
use tokio::sync::Notify;
Expand Down Expand Up @@ -361,3 +362,5 @@ impl Store for CompletenessCheckingStore {
Box::new(self)
}
}

default_health_status_indicator!(CompletenessCheckingStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/compression_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use lz4_flex::block::{compress_into, decompress_into, get_maximum_output_size};
use nativelink_error::{error_if, make_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::{DigestInfo, JoinHandleDropGuard};
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::Registry;
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -590,3 +591,5 @@ impl Store for CompressionStore {
self.inner_store.clone().register_metrics(inner_store_registry);
}
}

default_health_status_indicator!(CompressionStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/dedup_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use nativelink_error::{make_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf, StreamReader};
use nativelink_util::common::DigestInfo;
use nativelink_util::fastcdc::FastCDC;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use serde::{Deserialize, Serialize};
use tokio_util::codec::FramedRead;
Expand Down Expand Up @@ -349,3 +350,5 @@ impl Store for DedupStore {
Box::new(self)
}
}

default_health_status_indicator!(DedupStore);
31 changes: 19 additions & 12 deletions nativelink-store/src/default_store_factory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use futures::stream::FuturesOrdered;
use futures::{Future, TryStreamExt};
use nativelink_config::stores::StoreConfig;
use nativelink_error::Error;
use nativelink_util::health_utils::HealthRegistryBuilder;
use nativelink_util::metrics_utils::Registry;
use nativelink_util::store_trait::Store;

Expand All @@ -44,51 +45,52 @@ pub fn store_factory<'a>(
backend: &'a StoreConfig,
store_manager: &'a Arc<StoreManager>,
maybe_store_metrics: Option<&'a mut Registry>,
maybe_health_registry_builder: Option<&'a mut HealthRegistryBuilder>,
) -> Pin<FutureMaybeStore<'a>> {
Box::pin(async move {
let store: Arc<dyn Store> = match backend {
StoreConfig::memory(config) => Arc::new(MemoryStore::new(config)),
StoreConfig::experimental_s3_store(config) => Arc::new(S3Store::new(config).await?),
StoreConfig::verify(config) => Arc::new(VerifyStore::new(
config,
store_factory(&config.backend, store_manager, None).await?,
store_factory(&config.backend, store_manager, None, None).await?,
)),
StoreConfig::compression(config) => Arc::new(CompressionStore::new(
*config.clone(),
store_factory(&config.backend, store_manager, None).await?,
store_factory(&config.backend, store_manager, None, None).await?,
)?),
StoreConfig::dedup(config) => Arc::new(DedupStore::new(
config,
store_factory(&config.index_store, store_manager, None).await?,
store_factory(&config.content_store, store_manager, None).await?,
store_factory(&config.index_store, store_manager, None, None).await?,
store_factory(&config.content_store, store_manager, None, None).await?,
)),
StoreConfig::existence_cache(config) => Arc::new(ExistenceCacheStore::new(
config,
store_factory(&config.backend, store_manager, None).await?,
store_factory(&config.backend, store_manager, None, None).await?,
)),
StoreConfig::completeness_checking(config) => Arc::new(CompletenessCheckingStore::new(
store_factory(&config.backend, store_manager, None).await?,
store_factory(&config.cas_store, store_manager, None).await?,
store_factory(&config.backend, store_manager, None, None).await?,
store_factory(&config.cas_store, store_manager, None, None).await?,
)),
StoreConfig::fast_slow(config) => Arc::new(FastSlowStore::new(
config,
store_factory(&config.fast, store_manager, None).await?,
store_factory(&config.slow, store_manager, None).await?,
store_factory(&config.fast, store_manager, None, None).await?,
store_factory(&config.slow, store_manager, None, None).await?,
)),
StoreConfig::filesystem(config) => Arc::new(<FilesystemStore>::new(config).await?),
StoreConfig::ref_store(config) => Arc::new(RefStore::new(config, Arc::downgrade(store_manager))),
StoreConfig::size_partitioning(config) => Arc::new(SizePartitioningStore::new(
config,
store_factory(&config.lower_store, store_manager, None).await?,
store_factory(&config.upper_store, store_manager, None).await?,
store_factory(&config.lower_store, store_manager, None, None).await?,
store_factory(&config.upper_store, store_manager, None, None).await?,
)),
StoreConfig::grpc(config) => Arc::new(GrpcStore::new(config).await?),
StoreConfig::noop => Arc::new(NoopStore::new()),
StoreConfig::shard(config) => {
let stores = config
.stores
.iter()
.map(|store_config| store_factory(&store_config.store, store_manager, None))
.map(|store_config| store_factory(&store_config.store, store_manager, None, None))
.collect::<FuturesOrdered<_>>()
.try_collect::<Vec<_>>()
.await?;
Expand All @@ -98,6 +100,11 @@ pub fn store_factory<'a>(
if let Some(store_metrics) = maybe_store_metrics {
store.clone().register_metrics(store_metrics);
}

if let Some(health_registry_builder) = maybe_health_registry_builder {
store.clone().register_health(health_registry_builder);
}

Ok(store)
})
}
3 changes: 3 additions & 0 deletions nativelink-store/src/existence_cache_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use nativelink_error::{error_if, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::evicting_map::{EvictingMap, LenEntry};
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::{CollectorState, MetricsComponent, Registry};
use nativelink_util::store_trait::{Store, UploadSizeInfo};

Expand Down Expand Up @@ -198,3 +199,5 @@ impl MetricsComponent for ExistenceCacheStore {
self.existence_cache.gather_metrics(c)
}
}

default_health_status_indicator!(ExistenceCacheStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/fast_slow_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use futures::{join, FutureExt};
use nativelink_error::{make_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::Registry;
use nativelink_util::store_trait::{Store, UploadSizeInfo};

Expand Down Expand Up @@ -268,3 +269,5 @@ impl Store for FastSlowStore {
self.slow_store.clone().register_metrics(slow_store_registry);
}
}

default_health_status_indicator!(FastSlowStore);
17 changes: 17 additions & 0 deletions nativelink-store/src/filesystem_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::borrow::Cow;
use std::ffi::OsString;
use std::fmt::{Debug, Formatter};
use std::pin::Pin;
Expand All @@ -29,6 +30,7 @@ use nativelink_error::{make_err, make_input_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::{fs, DigestInfo};
use nativelink_util::evicting_map::{EvictingMap, LenEntry};
use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator};
use nativelink_util::metrics_utils::{Collector, CollectorState, MetricsComponent, Registry};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, SeekFrom};
Expand Down Expand Up @@ -750,6 +752,10 @@ impl<Fe: FileEntry> Store for FilesystemStore<Fe> {
fn register_metrics(self: Arc<Self>, registry: &mut Registry) {
registry.register_collector(Box::new(Collector::new(&self)));
}

/// Registers this store as a health indicator by handing the
/// `Arc<Self>` to `registry.register_indicator`.
fn register_health(self: Arc<Self>, registry: &mut HealthRegistryBuilder) {
registry.register_indicator(self);
}
}

impl<Fe: FileEntry> MetricsComponent for FilesystemStore<Fe> {
Expand Down Expand Up @@ -777,3 +783,14 @@ impl<Fe: FileEntry> MetricsComponent for FilesystemStore<Fe> {
c.publish("evicting_map", self.evicting_map.as_ref(), "");
}
}

/// `HealthStatusIndicator` implementation for `FilesystemStore`: it reports
/// under a fixed name and forwards the check itself to the `Store` trait.
#[async_trait]
impl<Fe: FileEntry> HealthStatusIndicator for FilesystemStore<Fe> {
/// Returns the static name identifying this indicator.
fn get_name(&self) -> &'static str {
"FilesystemStore"
}

/// Runs the health check by forwarding `namespace` to
/// `Store::check_health` and returning its `HealthStatus`.
///
/// Both `Store` and this trait have a method named `check_health`;
/// the fully-qualified call is presumably there to select the `Store`
/// one rather than recursing into this method — confirm.
async fn check_health(&self, namespace: Cow<'static, str>) -> HealthStatus {
Store::check_health(Pin::new(self), namespace).await
}
}
5 changes: 4 additions & 1 deletion nativelink-store/src/grpc_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@ use nativelink_proto::google::bytestream::{
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::grpc_utils::ConnectionManager;
use nativelink_util::health_utils::HealthStatusIndicator;
use nativelink_util::resource_info::ResourceInfo;
use nativelink_util::retry::{Retrier, RetryResult};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use nativelink_util::tls_utils;
use nativelink_util::write_request_stream_wrapper::WriteRequestStreamWrapper;
use nativelink_util::{default_health_status_indicator, tls_utils};
use parking_lot::Mutex;
use prost::Message;
use rand::rngs::OsRng;
Expand Down Expand Up @@ -844,3 +845,5 @@ impl Store for GrpcStore {
Box::new(self)
}
}

default_health_status_indicator!(GrpcStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/memory_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use nativelink_error::{Code, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::evicting_map::{EvictingMap, LenEntry};
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::{Collector, CollectorState, MetricsComponent, Registry};
use nativelink_util::store_trait::{Store, UploadSizeInfo};

Expand Down Expand Up @@ -170,3 +171,5 @@ impl MetricsComponent for MemoryStore {
c.publish("evicting_map", &self.evicting_map, "");
}
}

default_health_status_indicator!(MemoryStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/noop_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use async_trait::async_trait;
use nativelink_error::{make_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::store_trait::{Store, UploadSizeInfo};

#[derive(Default)]
Expand Down Expand Up @@ -71,3 +72,5 @@ impl Store for NoopStore {
Box::new(self)
}
}

default_health_status_indicator!(NoopStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/ref_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use async_trait::async_trait;
use nativelink_error::{make_err, make_input_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use tracing::error;

Expand Down Expand Up @@ -137,3 +138,5 @@ impl Store for RefStore {
Box::new(self)
}
}

default_health_status_indicator!(RefStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/s3_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ use hyper_rustls::{HttpsConnector, MaybeHttpsStream};
use nativelink_error::{error_if, make_err, make_input_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::retry::{Retrier, RetryResult};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use rand::rngs::OsRng;
Expand Down Expand Up @@ -534,3 +535,5 @@ impl Store for S3Store {
Box::new(self)
}
}

default_health_status_indicator!(S3Store);
3 changes: 3 additions & 0 deletions nativelink-store/src/shard_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use futures::stream::{FuturesUnordered, TryStreamExt};
use nativelink_error::{error_if, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::Registry;
use nativelink_util::store_trait::{Store, UploadSizeInfo};

Expand Down Expand Up @@ -191,3 +192,5 @@ impl Store for ShardStore {
}
}
}

default_health_status_indicator!(ShardStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/size_partitioning_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use async_trait::async_trait;
use nativelink_error::{Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use tokio::join;

Expand Down Expand Up @@ -128,3 +129,5 @@ impl Store for SizePartitioningStore {
Box::new(self)
}
}

default_health_status_indicator!(SizePartitioningStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/verify_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use nativelink_error::{make_input_err, Error, ResultExt};
use nativelink_util::buf_channel::{make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc};
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::{Collector, CollectorState, CounterWithTime, MetricsComponent, Registry};
use nativelink_util::store_trait::{Store, UploadSizeInfo};

Expand Down Expand Up @@ -199,3 +200,5 @@ impl MetricsComponent for VerifyStore {
);
}
}

default_health_status_indicator!(VerifyStore);

0 comments on commit 9caec5f

Please sign in to comment.