Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce component based health #636

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions nativelink-config/src/cas_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,15 @@ pub struct GlobalConfig {
///
/// Default: ConfigDigestHashFunction::sha256
pub default_digest_hash_function: Option<ConfigDigestHashFunction>,

/// Size, in bytes, of the data used to build a digest when running
/// health checks. Health checks are expected to fill a buffer of
/// this size and create a digest from its contents.
///
/// Default: 1024*1024 (1MiB)
#[serde(default, deserialize_with = "convert_numeric_with_shellexpand")]
pub default_digest_size_health_check: usize,
}

#[derive(Deserialize, Debug)]
Expand Down
2 changes: 2 additions & 0 deletions nativelink-service/tests/ac_server_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ async fn make_store_manager() -> Result<Arc<StoreManager>, Error> {
&nativelink_config::stores::StoreConfig::memory(nativelink_config::stores::MemoryStore::default()),
&store_manager,
Some(&mut <Registry>::default()),
None,
)
.await?,
);
Expand All @@ -64,6 +65,7 @@ async fn make_store_manager() -> Result<Arc<StoreManager>, Error> {
&nativelink_config::stores::StoreConfig::memory(nativelink_config::stores::MemoryStore::default()),
&store_manager,
Some(&mut <Registry>::default()),
None,
)
.await?,
);
Expand Down
1 change: 1 addition & 0 deletions nativelink-service/tests/bytestream_server_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ async fn make_store_manager() -> Result<Arc<StoreManager>, Error> {
&nativelink_config::stores::StoreConfig::memory(nativelink_config::stores::MemoryStore::default()),
&store_manager,
Some(&mut <Registry>::default()),
None,
)
.await?,
);
Expand Down
1 change: 1 addition & 0 deletions nativelink-service/tests/cas_server_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ async fn make_store_manager() -> Result<Arc<StoreManager>, Error> {
&nativelink_config::stores::StoreConfig::memory(nativelink_config::stores::MemoryStore::default()),
&store_manager,
Some(&mut <Registry>::default()),
None,
)
.await?,
);
Expand Down
1 change: 1 addition & 0 deletions nativelink-store/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ rust_library(
"@crates//:serde",
"@crates//:sha2",
"@crates//:shellexpand",
"@crates//:tempfile",
"@crates//:tokio",
"@crates//:tokio-stream",
"@crates//:tokio-util",
Expand Down
1 change: 1 addition & 0 deletions nativelink-store/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ rand = "0.8.5"
serde = "1.0.193"
sha2 = "0.10.8"
shellexpand = "3.1.0"
tempfile = "3.9.0"
tokio = { version = "1.35.1" }
tokio-stream = { version = "0.1.14", features = ["fs"] }
tokio-util = { version = "0.7.10" }
Expand Down
3 changes: 3 additions & 0 deletions nativelink-store/src/completeness_checking_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use nativelink_proto::build::bazel::remote::execution::v2::{
};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use parking_lot::Mutex;
use tokio::sync::Notify;
Expand Down Expand Up @@ -361,3 +362,5 @@ impl Store for CompletenessCheckingStore {
Box::new(self)
}
}

default_health_status_indicator!(CompletenessCheckingStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/compression_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use lz4_flex::block::{compress_into, decompress_into, get_maximum_output_size};
use nativelink_error::{error_if, make_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::{DigestInfo, JoinHandleDropGuard};
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::Registry;
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -590,3 +591,5 @@ impl Store for CompressionStore {
self.inner_store.clone().register_metrics(inner_store_registry);
}
}

default_health_status_indicator!(CompressionStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/dedup_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use nativelink_error::{make_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf, StreamReader};
use nativelink_util::common::DigestInfo;
use nativelink_util::fastcdc::FastCDC;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use serde::{Deserialize, Serialize};
use tokio_util::codec::FramedRead;
Expand Down Expand Up @@ -349,3 +350,5 @@ impl Store for DedupStore {
Box::new(self)
}
}

default_health_status_indicator!(DedupStore);
31 changes: 19 additions & 12 deletions nativelink-store/src/default_store_factory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use futures::stream::FuturesOrdered;
use futures::{Future, TryStreamExt};
use nativelink_config::stores::StoreConfig;
use nativelink_error::Error;
use nativelink_util::health_utils::HealthRegistryBuilder;
use nativelink_util::metrics_utils::Registry;
use nativelink_util::store_trait::Store;

Expand All @@ -44,51 +45,52 @@ pub fn store_factory<'a>(
backend: &'a StoreConfig,
store_manager: &'a Arc<StoreManager>,
maybe_store_metrics: Option<&'a mut Registry>,
maybe_health_registry_builder: Option<&'a mut HealthRegistryBuilder>,
) -> Pin<FutureMaybeStore<'a>> {
Box::pin(async move {
let store: Arc<dyn Store> = match backend {
StoreConfig::memory(config) => Arc::new(MemoryStore::new(config)),
StoreConfig::experimental_s3_store(config) => Arc::new(S3Store::new(config).await?),
StoreConfig::verify(config) => Arc::new(VerifyStore::new(
config,
store_factory(&config.backend, store_manager, None).await?,
store_factory(&config.backend, store_manager, None, None).await?,
)),
StoreConfig::compression(config) => Arc::new(CompressionStore::new(
*config.clone(),
store_factory(&config.backend, store_manager, None).await?,
store_factory(&config.backend, store_manager, None, None).await?,
)?),
StoreConfig::dedup(config) => Arc::new(DedupStore::new(
config,
store_factory(&config.index_store, store_manager, None).await?,
store_factory(&config.content_store, store_manager, None).await?,
store_factory(&config.index_store, store_manager, None, None).await?,
store_factory(&config.content_store, store_manager, None, None).await?,
)),
StoreConfig::existence_cache(config) => Arc::new(ExistenceCacheStore::new(
config,
store_factory(&config.backend, store_manager, None).await?,
store_factory(&config.backend, store_manager, None, None).await?,
)),
StoreConfig::completeness_checking(config) => Arc::new(CompletenessCheckingStore::new(
store_factory(&config.backend, store_manager, None).await?,
store_factory(&config.cas_store, store_manager, None).await?,
store_factory(&config.backend, store_manager, None, None).await?,
store_factory(&config.cas_store, store_manager, None, None).await?,
)),
StoreConfig::fast_slow(config) => Arc::new(FastSlowStore::new(
config,
store_factory(&config.fast, store_manager, None).await?,
store_factory(&config.slow, store_manager, None).await?,
store_factory(&config.fast, store_manager, None, None).await?,
store_factory(&config.slow, store_manager, None, None).await?,
)),
StoreConfig::filesystem(config) => Arc::new(<FilesystemStore>::new(config).await?),
StoreConfig::ref_store(config) => Arc::new(RefStore::new(config, Arc::downgrade(store_manager))),
StoreConfig::size_partitioning(config) => Arc::new(SizePartitioningStore::new(
config,
store_factory(&config.lower_store, store_manager, None).await?,
store_factory(&config.upper_store, store_manager, None).await?,
store_factory(&config.lower_store, store_manager, None, None).await?,
store_factory(&config.upper_store, store_manager, None, None).await?,
)),
StoreConfig::grpc(config) => Arc::new(GrpcStore::new(config).await?),
StoreConfig::noop => Arc::new(NoopStore::new()),
StoreConfig::shard(config) => {
let stores = config
.stores
.iter()
.map(|store_config| store_factory(&store_config.store, store_manager, None))
.map(|store_config| store_factory(&store_config.store, store_manager, None, None))
.collect::<FuturesOrdered<_>>()
.try_collect::<Vec<_>>()
.await?;
Expand All @@ -98,6 +100,11 @@ pub fn store_factory<'a>(
if let Some(store_metrics) = maybe_store_metrics {
store.clone().register_metrics(store_metrics);
}

if let Some(health_registry_builder) = maybe_health_registry_builder {
store.clone().register_health(health_registry_builder);
}

Ok(store)
})
}
3 changes: 3 additions & 0 deletions nativelink-store/src/existence_cache_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use nativelink_error::{error_if, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::evicting_map::{EvictingMap, LenEntry};
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::{CollectorState, MetricsComponent, Registry};
use nativelink_util::store_trait::{Store, UploadSizeInfo};

Expand Down Expand Up @@ -198,3 +199,5 @@ impl MetricsComponent for ExistenceCacheStore {
self.existence_cache.gather_metrics(c)
}
}

default_health_status_indicator!(ExistenceCacheStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/fast_slow_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use futures::{join, FutureExt};
use nativelink_error::{make_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::Registry;
use nativelink_util::store_trait::{Store, UploadSizeInfo};

Expand Down Expand Up @@ -268,3 +269,5 @@ impl Store for FastSlowStore {
self.slow_store.clone().register_metrics(slow_store_registry);
}
}

default_health_status_indicator!(FastSlowStore);
17 changes: 17 additions & 0 deletions nativelink-store/src/filesystem_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::borrow::Cow;
use std::ffi::OsString;
use std::fmt::{Debug, Formatter};
use std::pin::Pin;
Expand All @@ -29,6 +30,7 @@ use nativelink_error::{make_err, make_input_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::{fs, DigestInfo};
use nativelink_util::evicting_map::{EvictingMap, LenEntry};
use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator};
use nativelink_util::metrics_utils::{Collector, CollectorState, MetricsComponent, Registry};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, SeekFrom};
Expand Down Expand Up @@ -750,6 +752,10 @@ impl<Fe: FileEntry> Store for FilesystemStore<Fe> {
/// Registers a metrics collector for this store with `registry`.
///
/// The collector is created with `Collector::new` from a reference to
/// this store and handed to the registry as a boxed value.
fn register_metrics(self: Arc<Self>, registry: &mut Registry) {
registry.register_collector(Box::new(Collector::new(&self)));
}

/// Registers this store as a health indicator with `registry`.
///
/// The `Arc<Self>` is passed to `register_indicator` by value, so the
/// registry builder receives its own handle to the store.
fn register_health(self: Arc<Self>, registry: &mut HealthRegistryBuilder) {
registry.register_indicator(self);
}
}

impl<Fe: FileEntry> MetricsComponent for FilesystemStore<Fe> {
Expand Down Expand Up @@ -777,3 +783,14 @@ impl<Fe: FileEntry> MetricsComponent for FilesystemStore<Fe> {
c.publish("evicting_map", self.evicting_map.as_ref(), "");
}
}

/// Health reporting for the filesystem-backed store.
///
/// The indicator identifies itself as `"FilesystemStore"` and answers
/// health checks by forwarding to the `Store` trait's `check_health`.
#[async_trait]
impl<Fe: FileEntry> HealthStatusIndicator for FilesystemStore<Fe> {
    /// Returns the fixed name under which this indicator reports.
    fn get_name(&self) -> &'static str {
        "FilesystemStore"
    }

    /// Runs the health check for `namespace` and returns its status.
    ///
    /// The call is routed explicitly through `Store::check_health` so that
    /// the store-level check is invoked rather than this method itself.
    async fn check_health(&self, namespace: Cow<'static, str>) -> HealthStatus {
        // `Store::check_health` takes a pinned receiver, so wrap the
        // shared reference before delegating.
        let pinned_store = Pin::new(self);
        Store::check_health(pinned_store, namespace).await
    }
}
5 changes: 4 additions & 1 deletion nativelink-store/src/grpc_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@ use nativelink_proto::google::bytestream::{
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::grpc_utils::ConnectionManager;
use nativelink_util::health_utils::HealthStatusIndicator;
use nativelink_util::resource_info::ResourceInfo;
use nativelink_util::retry::{Retrier, RetryResult};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use nativelink_util::tls_utils;
use nativelink_util::write_request_stream_wrapper::WriteRequestStreamWrapper;
use nativelink_util::{default_health_status_indicator, tls_utils};
use parking_lot::Mutex;
use prost::Message;
use rand::rngs::OsRng;
Expand Down Expand Up @@ -844,3 +845,5 @@ impl Store for GrpcStore {
Box::new(self)
}
}

default_health_status_indicator!(GrpcStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/memory_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use nativelink_error::{Code, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::evicting_map::{EvictingMap, LenEntry};
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::{Collector, CollectorState, MetricsComponent, Registry};
use nativelink_util::store_trait::{Store, UploadSizeInfo};

Expand Down Expand Up @@ -170,3 +171,5 @@ impl MetricsComponent for MemoryStore {
c.publish("evicting_map", &self.evicting_map, "");
}
}

default_health_status_indicator!(MemoryStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/noop_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use async_trait::async_trait;
use nativelink_error::{make_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::store_trait::{Store, UploadSizeInfo};

#[derive(Default)]
Expand Down Expand Up @@ -71,3 +72,5 @@ impl Store for NoopStore {
Box::new(self)
}
}

default_health_status_indicator!(NoopStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/ref_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use async_trait::async_trait;
use nativelink_error::{make_err, make_input_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use tracing::error;

Expand Down Expand Up @@ -137,3 +138,5 @@ impl Store for RefStore {
Box::new(self)
}
}

default_health_status_indicator!(RefStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/s3_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ use hyper_rustls::{HttpsConnector, MaybeHttpsStream};
use nativelink_error::{error_if, make_err, make_input_err, Code, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::retry::{Retrier, RetryResult};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use rand::rngs::OsRng;
Expand Down Expand Up @@ -534,3 +535,5 @@ impl Store for S3Store {
Box::new(self)
}
}

default_health_status_indicator!(S3Store);
3 changes: 3 additions & 0 deletions nativelink-store/src/shard_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use futures::stream::{FuturesUnordered, TryStreamExt};
use nativelink_error::{error_if, Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::Registry;
use nativelink_util::store_trait::{Store, UploadSizeInfo};

Expand Down Expand Up @@ -191,3 +192,5 @@ impl Store for ShardStore {
}
}
}

default_health_status_indicator!(ShardStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/size_partitioning_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use async_trait::async_trait;
use nativelink_error::{Error, ResultExt};
use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::store_trait::{Store, UploadSizeInfo};
use tokio::join;

Expand Down Expand Up @@ -128,3 +129,5 @@ impl Store for SizePartitioningStore {
Box::new(self)
}
}

default_health_status_indicator!(SizePartitioningStore);
3 changes: 3 additions & 0 deletions nativelink-store/src/verify_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use nativelink_error::{make_input_err, Error, ResultExt};
use nativelink_util::buf_channel::{make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf};
use nativelink_util::common::DigestInfo;
use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc};
use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator};
use nativelink_util::metrics_utils::{Collector, CollectorState, CounterWithTime, MetricsComponent, Registry};
use nativelink_util::store_trait::{Store, UploadSizeInfo};

Expand Down Expand Up @@ -199,3 +200,5 @@ impl MetricsComponent for VerifyStore {
);
}
}

default_health_status_indicator!(VerifyStore);