From 997be53c7560bb0dca8fe2ab08831ec172ede7a6 Mon Sep 17 00:00:00 2001 From: allada Date: Tue, 9 Nov 2021 23:22:52 -0800 Subject: [PATCH] Dedup store will now bypass deduplication when size is small Also change the default values after some testing. --- cas/store/dedup_store.rs | 24 ++++++++++++++++++++---- cas/store/tests/dedup_store_test.rs | 14 ++++++++------ config/backends.rs | 12 +++++++++--- config/examples/basic_cas.json | 28 ++++++++++++++++++++++------ 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/cas/store/dedup_store.rs b/cas/store/dedup_store.rs index c4e32d4c7..84cc2b28e 100644 --- a/cas/store/dedup_store.rs +++ b/cas/store/dedup_store.rs @@ -28,9 +28,9 @@ use traits::{ResultFuture, StoreTrait, UploadSizeInfo}; // NOTE: If these change update the comments in `backends.rs` to reflect // the new defaults. -const DEFAULT_MIN_SIZE: usize = 4096; -const DEFAULT_NORM_SIZE: usize = 16384; -const DEFAULT_MAX_SIZE: usize = 65536; +const DEFAULT_MIN_SIZE: usize = 64 * 1024; +const DEFAULT_NORM_SIZE: usize = 256 * 1024; +const DEFAULT_MAX_SIZE: usize = 512 * 1024; const DEFAULT_MAX_CONCURRENT_FETCH_PER_GET: usize = 10; #[derive(Serialize, Deserialize, PartialEq, Debug, Default, Clone)] @@ -92,6 +92,10 @@ impl DedupStore { } } + fn is_small_object(&self, digest: &DigestInfo) -> bool { + return digest.size_bytes as usize <= self.upload_normal_size; + } + fn pin_index_store<'a>(&'a self) -> std::pin::Pin<&'a dyn StoreTrait> { Pin::new(self.index_store.as_ref()) } @@ -110,6 +114,12 @@ impl StoreTrait for DedupStore { size_info: UploadSizeInfo, ) -> ResultFuture<'a, ()> { Box::pin(async move { + if self.is_small_object(&digest) { + return Pin::new(self.content_store.as_ref()) + .update(digest, reader, size_info) + .await + .err_tip(|| "Failed to insert small object in dedup store"); + } let input_max_size = match size_info { UploadSizeInfo::ExactSize(sz) => sz, UploadSizeInfo::MaxSize(sz) => sz, @@ -186,6 +196,12 @@ impl StoreTrait for DedupStore { 
length: Option, ) -> ResultFuture<'a, ()> { Box::pin(async move { + if self.is_small_object(&digest) { + return Pin::new(self.content_store.as_ref()) + .get_part(digest, writer, offset, length) + .await + .err_tip(|| "Failed to get_part small object in dedup store"); + } // First we need to download the index that contains where the individual parts actually // can be fetched from. let index_entries = { @@ -222,7 +238,7 @@ impl StoreTrait for DedupStore { continue; } // Filter any items who's start byte is after the last requested byte. - if first_byte > offset + length.unwrap() { + if length.is_some() && first_byte > offset + length.unwrap() { continue; } entries.push(entry); diff --git a/cas/store/tests/dedup_store_test.rs b/cas/store/tests/dedup_store_test.rs index 58a2f02ae..05d273964 100644 --- a/cas/store/tests/dedup_store_test.rs +++ b/cas/store/tests/dedup_store_test.rs @@ -47,8 +47,8 @@ mod dedup_store_tests { ); let store = Pin::new(&store_owned); - let original_data = make_random_data(1 * MEGABYTE_SZ); - let digest = DigestInfo::try_new(&VALID_HASH1, 100).unwrap(); + let original_data = make_random_data(MEGABYTE_SZ); + let digest = DigestInfo::try_new(&VALID_HASH1, MEGABYTE_SZ).unwrap(); store .update( @@ -79,8 +79,8 @@ mod dedup_store_tests { ); let store = Pin::new(&store_owned); - let original_data = make_random_data(1 * MEGABYTE_SZ); - let digest = DigestInfo::try_new(&VALID_HASH1, 100).unwrap(); + let original_data = make_random_data(MEGABYTE_SZ); + let digest = DigestInfo::try_new(&VALID_HASH1, MEGABYTE_SZ).unwrap(); store .update( @@ -95,10 +95,12 @@ mod dedup_store_tests { const LAST_CHUNK_HASH: &str = "9220cc441e3860a0a8f5ed984d5b2da69c09ca800dcfd7a93c755acf8561e7a5"; const LAST_CHUNK_SIZE: usize = 25779; - content_store + let did_delete = content_store .remove_entry(&DigestInfo::try_new(LAST_CHUNK_HASH, LAST_CHUNK_SIZE).unwrap()) .await; + assert_eq!(did_delete, true, "Expected item to exist in store"); + let result = 
store.get_part(digest.clone(), &mut vec![], 0, None).await; assert!(result.is_err(), "Expected result to be an error"); assert_eq!( @@ -123,7 +125,7 @@ mod dedup_store_tests { const DATA_SIZE: usize = MEGABYTE_SZ / 4; let original_data = make_random_data(DATA_SIZE); -        let digest = DigestInfo::try_new(&VALID_HASH1, 100).unwrap(); +        let digest = DigestInfo::try_new(&VALID_HASH1, DATA_SIZE).unwrap(); store .update( diff --git a/config/backends.rs b/config/backends.rs index e997bdad4..a281b1859 100644 --- a/config/backends.rs +++ b/config/backends.rs @@ -84,7 +84,7 @@ pub struct DedupStore { /// because it will actually not check this number of bytes when /// deciding where to partition the data. /// -    /// Default: 4096 (4k) +    /// Default: 65536 (64k) #[serde(default)] pub min_size: u32, @@ -92,13 +92,19 @@ pub struct DedupStore { /// of the chunks to this number. It is not a guarantee, but a /// slight attempt will be made. /// -    /// Default: 16384 (16k) +    /// This value is also used as the approximate threshold to decide +    /// whether we should even attempt to dedup the entry or just +    /// forward it directly to the content_store without an index. The +    /// effective threshold will be about `normal_size * 1.3` due to +    /// implementation details. +    /// +    /// Default: 262144 (256k) #[serde(default)] pub normal_size: u32, /// Maximum size a chunk is allowed to be. 
/// - /// Default: 65536 (64k) + /// Default: 524288 (512k) #[serde(default)] pub max_size: u32, diff --git a/config/examples/basic_cas.json b/config/examples/basic_cas.json index d270d1ea2..14a721cdd 100644 --- a/config/examples/basic_cas.json +++ b/config/examples/basic_cas.json @@ -3,21 +3,37 @@ "CAS_MAIN_STORE": { "verify": { "backend": { - "compression": { - "compression_algorithm": { - "LZ4": {} - }, - "backend": { + "dedup": { + "index_store": { "s3_store": { "region": "us-west-1", "bucket": "blaisebruer-cas-store", - "key_prefix": "test-prefix-cas/", + "key_prefix": "test-prefix-index/", "retry": { "max_retries": 6, "delay": 0.3, "jitter": 0.5, } } + }, + "content_store": { + "compression": { + "compression_algorithm": { + "LZ4": {} + }, + "backend": { + "s3_store": { + "region": "us-west-1", + "bucket": "blaisebruer-cas-store", + "key_prefix": "test-prefix-dedup-cas/", + "retry": { + "max_retries": 6, + "delay": 0.3, + "jitter": 0.5, + } + } + } + } } } },