Dedup store will now bypass deduplication when size is small
Also change the default values after some testing.
allada committed Nov 10, 2021
1 parent 3063d2c commit 997be53
Showing 4 changed files with 59 additions and 19 deletions.
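The core of the change, sketched as standalone Rust (hypothetical names, not the real store wiring; the actual code operates on `DigestInfo` and the store traits in the diff below): objects no larger than the upload "normal" chunk size skip the chunk index and go straight to the content store.

    // Minimal illustration of the bypass decision; the constant mirrors the
    // new `normal_size` default, and the function name is made up here.
    const UPLOAD_NORMAL_SIZE: usize = 256 * 1024;

    // Small objects would produce a single chunk plus an index entry, so
    // deduplicating them costs more than it saves.
    fn should_dedup(size_bytes: usize) -> bool {
        size_bytes > UPLOAD_NORMAL_SIZE
    }

    fn main() {
        assert!(!should_dedup(64 * 1024)); // bypass: write directly to content store
        assert!(should_dedup(1024 * 1024)); // chunk, index, then store
    }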
24 changes: 20 additions & 4 deletions cas/store/dedup_store.rs
@@ -28,9 +28,9 @@ use traits::{ResultFuture, StoreTrait, UploadSizeInfo};

// NOTE: If these change update the comments in `backends.rs` to reflect
// the new defaults.
-const DEFAULT_MIN_SIZE: usize = 4096;
-const DEFAULT_NORM_SIZE: usize = 16384;
-const DEFAULT_MAX_SIZE: usize = 65536;
+const DEFAULT_MIN_SIZE: usize = 64 * 1024;
+const DEFAULT_NORM_SIZE: usize = 256 * 1024;
+const DEFAULT_MAX_SIZE: usize = 512 * 1024;
const DEFAULT_MAX_CONCURRENT_FETCH_PER_GET: usize = 10;

#[derive(Serialize, Deserialize, PartialEq, Debug, Default, Clone)]
@@ -92,6 +92,10 @@ impl DedupStore
}
}

+fn is_small_object(&self, digest: &DigestInfo) -> bool {
+    return digest.size_bytes as usize <= self.upload_normal_size;
+}

fn pin_index_store<'a>(&'a self) -> std::pin::Pin<&'a dyn StoreTrait> {
Pin::new(self.index_store.as_ref())
}
@@ -110,6 +114,12 @@ impl StoreTrait for DedupStore {
size_info: UploadSizeInfo,
) -> ResultFuture<'a, ()> {
Box::pin(async move {
+if self.is_small_object(&digest) {
+    return Pin::new(self.content_store.as_ref())
+        .update(digest, reader, size_info)
+        .await
+        .err_tip(|| "Failed to insert small object in dedup store");
+}
let input_max_size = match size_info {
UploadSizeInfo::ExactSize(sz) => sz,
UploadSizeInfo::MaxSize(sz) => sz,
@@ -186,6 +196,12 @@ impl StoreTrait for DedupStore {
length: Option<usize>,
) -> ResultFuture<'a, ()> {
Box::pin(async move {
+if self.is_small_object(&digest) {
+    return Pin::new(self.content_store.as_ref())
+        .get_part(digest, writer, offset, length)
+        .await
+        .err_tip(|| "Failed to get_part small object in dedup store");
+}
// First we need to download the index that contains where the individual parts actually
// can be fetched from.
let index_entries = {
@@ -222,7 +238,7 @@
continue;
}
// Filter any items who's start byte is after the last requested byte.
-if first_byte > offset + length.unwrap() {
+if length.is_some() && first_byte > offset + length.unwrap() {
continue;
}
entries.push(entry);
14 changes: 8 additions & 6 deletions cas/store/tests/dedup_store_test.rs
@@ -47,8 +47,8 @@ mod dedup_store_tests {
);
let store = Pin::new(&store_owned);

-let original_data = make_random_data(1 * MEGABYTE_SZ);
-let digest = DigestInfo::try_new(&VALID_HASH1, 100).unwrap();
+let original_data = make_random_data(MEGABYTE_SZ);
+let digest = DigestInfo::try_new(&VALID_HASH1, MEGABYTE_SZ).unwrap();

store
.update(
@@ -79,8 +79,8 @@
);
let store = Pin::new(&store_owned);

-let original_data = make_random_data(1 * MEGABYTE_SZ);
-let digest = DigestInfo::try_new(&VALID_HASH1, 100).unwrap();
+let original_data = make_random_data(MEGABYTE_SZ);
+let digest = DigestInfo::try_new(&VALID_HASH1, MEGABYTE_SZ).unwrap();

store
.update(
@@ -95,10 +95,12 @@
const LAST_CHUNK_HASH: &str = "9220cc441e3860a0a8f5ed984d5b2da69c09ca800dcfd7a93c755acf8561e7a5";
const LAST_CHUNK_SIZE: usize = 25779;

-content_store
+let did_delete = content_store
.remove_entry(&DigestInfo::try_new(LAST_CHUNK_HASH, LAST_CHUNK_SIZE).unwrap())
.await;

+assert_eq!(did_delete, true, "Expected item to exist in store");

let result = store.get_part(digest.clone(), &mut vec![], 0, None).await;
assert!(result.is_err(), "Expected result to be an error");
assert_eq!(
@@ -123,7 +125,7 @@

const DATA_SIZE: usize = MEGABYTE_SZ / 4;
let original_data = make_random_data(DATA_SIZE);
-let digest = DigestInfo::try_new(&VALID_HASH1, 100).unwrap();
+let digest = DigestInfo::try_new(&VALID_HASH1, DATA_SIZE).unwrap();

store
.update(
12 changes: 9 additions & 3 deletions config/backends.rs
@@ -84,21 +84,27 @@ pub struct DedupStore {
/// because it will actually not check this number of bytes when
/// deciding where to partition the data.
///
-/// Default: 4096 (4k)
+/// Default: 65536 (64k)
#[serde(default)]
pub min_size: u32,

/// A best-effort attempt will be made to keep the average size
/// of the chunks to this number. It is not a guarantee, but a
/// slight attempt will be made.
///
-/// Default: 16384 (16k)
+/// This value will also be about the threshold used to determine
+/// if we should even attempt to dedup the entry or just forward
+/// it directly to the content_store without an index. The actual
+/// value will be about `normal_size * 1.3` due to implementation
+/// details.
+///
+/// Default: 262144 (256k)
#[serde(default)]
pub normal_size: u32,

/// Maximum size a chunk is allowed to be.
///
-/// Default: 65536 (64k)
+/// Default: 524288 (512k)
#[serde(default)]
pub max_size: u32,

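The `normal_size` comment above ties the bypass cutoff to roughly `normal_size * 1.3`. With the new default of 262144 bytes that lands near 333 KiB; a quick check of the arithmetic (the 1.3 factor is approximate per the comment, not an exact constant):

    fn main() {
        let normal_size: u64 = 256 * 1024; // 262144, the new default
        let approx_cutoff = normal_size * 13 / 10;
        // Objects up to about this size are forwarded directly to content_store.
        println!("approximate bypass cutoff: {} bytes", approx_cutoff); // 340787
    }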
28 changes: 22 additions & 6 deletions config/examples/basic_cas.json
@@ -3,21 +3,37 @@
"CAS_MAIN_STORE": {
"verify": {
"backend": {
"compression": {
"compression_algorithm": {
"LZ4": {}
},
"backend": {
"dedup": {
"index_store": {
"s3_store": {
"region": "us-west-1",
"bucket": "blaisebruer-cas-store",
"key_prefix": "test-prefix-cas/",
"key_prefix": "test-prefix-index/",
"retry": {
"max_retries": 6,
"delay": 0.3,
"jitter": 0.5,
}
}
+},
+"content_store": {
+"compression": {
+"compression_algorithm": {
+"LZ4": {}
+},
+"backend": {
+"s3_store": {
+"region": "us-west-1",
+"bucket": "blaisebruer-cas-store",
+"key_prefix": "test-prefix-dedup-cas/",
+"retry": {
+"max_retries": 6,
+"delay": 0.3,
+"jitter": 0.5,
+}
+}
+}
+}
}
}
},
