Skip to content

Commit

Permalink
Single-process sub-millisecond temporal disambiguation of random labe…
Browse files Browse the repository at this point in the history
…ls. (#4800)

Add a sub-millisecond counter to `random_label` generation to
disambiguate fragments that are written within the same millisecond by
a single process.

---
TYPE: BUG
DESC: Single-process sub-millisecond temporal disambiguation of random
labels.
  • Loading branch information
bekadavis9 committed Mar 20, 2024
1 parent f8bbb74 commit 8ea85dc
Show file tree
Hide file tree
Showing 5 changed files with 263 additions and 28 deletions.
3 changes: 2 additions & 1 deletion tiledb/common/random/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#
# The MIT License
#
# Copyright (c) 2023 TileDB, Inc.
# Copyright (c) 2023-2024 TileDB, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -37,6 +37,7 @@ gather_sources(${SOURCES})
commence(object_library seedable_global_PRNG)
this_target_sources(${SOURCES})
this_target_link_libraries(export)
this_target_object_libraries(baseline time)
conclude(object_library)

add_test_subdirectory()
74 changes: 53 additions & 21 deletions tiledb/common/random/random_label.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
*
* The MIT License
*
* @copyright Copyright (c) 2023 TileDB, Inc.
* @copyright Copyright (c) 2023-2024 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand All @@ -31,33 +31,65 @@
*/

#include "tiledb/common/random/random_label.h"
#include "tiledb/common/random/prng.h"

#include <iomanip>
#include <sstream>

namespace tiledb::common {

/**
* Legacy code provides randomness using UUIDs, which are always 128 bits,
* represented as a 32-digit hexadecimal value.
*
* To ensure backward compatibility, this function formats the PRNG-generated
* values to be precisely a 32-digit hexadecimal value. Each value is padded
* with 0s such that it makes up one 16-digit half of the full 32-digit number.
*/
std::string random_label() {
/* ********************************* */
/* CONSTRUCTORS & DESTRUCTORS */
/* ********************************* */
/**
 * Constructs a generator, recording the current time (in milliseconds) as the
 * time of the previous label and starting the sub-millisecond counter at 0.
 *
 * The counter must be given a value here: if the first call to `generate()`
 * lands in the same millisecond as construction, the counter is incremented
 * without ever having been assigned.
 */
RandomLabelGenerator::RandomLabelGenerator()
    : prev_time_(tiledb::sm::utils::time::timestamp_now_ms())
    , counter_(0) {
}

/* ********************************* */
/* API */
/* ********************************* */
/**
 * Generates one label as a 32-digit, zero-padded hexadecimal string.
 *
 * Layout, most significant digits first:
 *   - 8 digits: the sub-millisecond counter,
 *   - 8 digits: a PRNG value truncated to 32 bits,
 *   - 16 digits: a PRNG value.
 *
 * The whole operation runs under `mtx_`, so the counter and the previous
 * timestamp are updated by one caller at a time.
 *
 * @throws RandomLabelException if the counter wraps around to 0 within a
 *     single millisecond.
 * @return The generated label.
 */
std::string RandomLabelGenerator::generate() {
  PRNG& prng = PRNG::get();
  std::lock_guard<std::mutex> lock(mtx_);
  const auto now = tiledb::sm::utils::time::timestamp_now_ms();

  if (now != prev_time_) {
    // First label for this timestamp: restart the counter at a random value.
    // The top bit is cleared so that at least 2^31 increments are available
    // before the counter can wrap.
    prev_time_ = now;
    counter_ = static_cast<uint32_t>(prng()) & 0x7FFFFFFF;
  } else {
    // Same timestamp as the previous label: the incremented counter keeps
    // labels distinct and ordered.
    counter_ += 1;
    if (counter_ == 0) {
      throw RandomLabelException("Maximum generation frequency exceeded.");
    }
  }

  // Format the 128-bit, 32-digit hexadecimal label. `std::hex` and the fill
  // character persist on the stream; the field width must be set per value.
  std::stringstream ss;
  ss << std::hex << std::setfill('0');
  ss << std::setw(8) << counter_;
  ss << std::setw(8) << static_cast<uint32_t>(prng());
  ss << std::setw(16) << prng();
  return ss.str();
}

std::string RandomLabelGenerator::generate_random_label() {
static RandomLabelGenerator generator;
return generator.generate();
}

/**
* Returns a PRNG-generated label as a 32-digit hexadecimal random number
* (Ex. f258d22d4db9139204eef2b4b5d860cc). Thin wrapper around
* `RandomLabelGenerator::generate_random_label`.
*
* @note If multiple labels are generated within the same millisecond, they
* are ordered by a counter held in the most significant 4 bytes.
* @note Labels may be 0-padded to ensure exactly a 128-bit, 32-digit length.
*
* @return A random label.
*/
std::string random_label() {
return RandomLabelGenerator::generate_random_label();
}

} // namespace tiledb::common
72 changes: 68 additions & 4 deletions tiledb/common/random/random_label.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
*
* The MIT License
*
* @copyright Copyright (c) 2023 TileDB, Inc.
* @copyright Copyright (c) 2023-2024 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -33,16 +33,80 @@
#ifndef TILEDB_HELPERS_H
#define TILEDB_HELPERS_H

#include <iomanip>
#include <mutex>
#include <sstream>
#include <string>

#include "tiledb/common/exception/exception.h"
#include "tiledb/common/random/prng.h"
#include "tiledb/sm/misc/tdb_time.h"

namespace tiledb::common {

/**
* Exception type thrown by the random label generator.
*
* Forwards to `StatusException` with "RandomLabel" as the first argument
* (presumably the origin tag shown in the error text -- confirm against
* `StatusException`).
*/
class RandomLabelException : public StatusException {
public:
/**
* @param message Description of the failure.
*/
explicit RandomLabelException(const std::string& message)
: StatusException("RandomLabel", message) {
}
};

/**
* Generates a pseudo-random label, formatted as a 32-digit hexadecimal number.
* (Ex. f258d22d4db9139204eef2b4b5d860cc).
*
* @note If multiple labels are generated within the same millisecond, they
* are ordered by a counter held in the most significant 4 bytes of the label.
* @note Use of wrapper `random_label()` is encouraged in production code.
*/
class RandomLabelGenerator {
public:
/* ********************************* */
/* CONSTRUCTORS & DESTRUCTORS */
/* ********************************* */
DISABLE_COPY_AND_COPY_ASSIGN(RandomLabelGenerator);
DISABLE_MOVE_AND_MOVE_ASSIGN(RandomLabelGenerator);

/** Default destructor. */
~RandomLabelGenerator() = default;

protected:
/**
* Protected constructor, abstracted by public-facing accessor.
* Being protected rather than private, it remains reachable from derived
* classes.
*/
RandomLabelGenerator();

/* ********************************* */
/* API */
/* ********************************* */
/**
* Generate a random label.
*
* @return The label as a 32-digit hexadecimal string.
*/
std::string generate();

public:
/**
* Generate a random label. Static entry point; no instance is required by
* the caller.
*
* @return The label as a 32-digit hexadecimal string.
*/
static std::string generate_random_label();

private:
/* ********************************* */
/* PRIVATE ATTRIBUTES */
/* ********************************* */

/** Mutex which protects against simultaneous random label generation. */
std::mutex mtx_;

/** The time (in milliseconds) of the last label creation. */
uint64_t prev_time_;

/** The sub-millisecond counter portion of the random label. */
uint32_t counter_;
};

/**
* Wrapper function for `generate_random_label`, which returns a PRNG-generated
* label as a 32-digit hexadecimal random number.
* (Ex. f258d22d4db9139204eef2b4b5d860cc).
*
* @note If multiple labels are generated within the same millisecond, they
* are ordered by a counter held in the most significant 4 bytes.
* @note Labels may be 0-padded to ensure exactly a 128-bit, 32-digit length.
*
* @return A random label.
*/
Expand Down
4 changes: 2 additions & 2 deletions tiledb/common/random/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#
# The MIT License
#
# Copyright (c) 2023 TileDB, Inc.
# Copyright (c) 2023-2024 TileDB, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand All @@ -27,6 +27,6 @@
include(unit_test)

commence(unit_test seedable_global_PRNG)
this_target_sources(unit_seedable_global_PRNG.cc)
this_target_sources(unit_random_label_generator.cc unit_seedable_global_PRNG.cc)
this_target_object_libraries(seedable_global_PRNG)
conclude(unit_test)
138 changes: 138 additions & 0 deletions tiledb/common/random/test/unit_random_label_generator.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/**
* @file tiledb/common/random/test/unit_random_label_generator.cc
*
* @section LICENSE
*
* The MIT License
*
* @copyright Copyright (c) 2024 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* Tests for the random label generator.
*/

#include <test/support/tdb_catch.h>
#include "../random_label.h"

#include <algorithm>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>

using namespace tiledb::common;
using namespace tiledb::sm;

/**
 * Fills `labels` from the front with freshly generated random labels.
 *
 * Generation stops when either the buffer is full or the millisecond clock
 * reaches 100 past the value read on entry, whichever happens first.
 *
 * @param labels Pre-sized buffer that receives the labels.
 * @return The number of labels written.
 */
size_t generate_labels(std::vector<std::string>& labels) {
  const size_t capacity = labels.size();
  const auto deadline = utils::time::timestamp_now_ms() + 100;

  size_t written = 0;
  for (; utils::time::timestamp_now_ms() < deadline && written < capacity;
       ++written) {
    labels[written] = random_label();
  }
  return written;
}

/**
 * Validates the ordering of a sequence of labels.
 *
 * Given the label randomness and the fact that we're racing the processor,
 * the best we can do here (for now) is assert that there are more than 10
 * ordered groups. A group is a run of consecutive labels sharing the same
 * first 4 hexadecimal characters; only groups with more than 10 matching
 * adjacent pairs are counted.
 *
 * @param labels The labels to validate.
 * @param num_labels The number of leading entries of `labels` to examine.
 */
void validate_labels(
    const std::vector<std::string>& labels, size_t num_labels) {
  // Number of leading characters that define a group.
  constexpr size_t prefix_len = 4;

  uint64_t num_groups = 0;
  uint64_t this_group = 0;
  for (size_t i = 1; i < num_labels; i++) {
    const bool same_prefix =
        labels[i].compare(0, prefix_len, labels[i - 1], 0, prefix_len) == 0;
    if (!same_prefix) {
      // The prefix changed: close the current group and start a new one.
      if (this_group > 10) {
        num_groups += 1;
      }
      this_group = 0;
      continue;
    }

    // We share a prefix so assert that they're ordered.
    REQUIRE(labels[i] > labels[i - 1]);
    this_group += 1;
  }

  // A group is only closed inside the loop when the next prefix differs, so
  // the final group has to be accounted for here.
  if (this_group > 10) {
    num_groups += 1;
  }

  REQUIRE(num_groups > 10);
}

TEST_CASE(
    "RandomLabelGenerator: serial generation",
    "[RandomLabelGenerator][serial]") {
  // A first label validates initialization: it must be 32 characters long.
  const std::string first_label = random_label();
  REQUIRE(first_label.size() == 32);

  // Generate up to one million labels on this thread and validate them.
  // Counter overflow is not exercised here; that check is assumed to work.
  const size_t max_labels = 1000000;
  std::vector<std::string> buffer(max_labels);
  const size_t generated = generate_labels(buffer);
  validate_labels(buffer, generated);
}

TEST_CASE(
    "RandomLabelGenerator: parallel generation",
    "[RandomLabelGenerator][parallel]") {
  const unsigned nthreads = 20;
  std::vector<std::thread> threads;
  threads.reserve(nthreads);
  std::vector<std::vector<std::string>> labels(nthreads);
  // Per-thread label counts, zero-filled so no entry is ever indeterminate.
  std::vector<size_t> num_labels(nthreads, 0);

  // Pre-allocate our buffers so we're getting as much contention as possible
  for (auto& buffer : labels) {
    buffer.resize(1000000);
  }

  // Generate labels simultaneously in multiple threads. Each thread writes
  // only to its own buffer and its own count; neither outer vector is resized
  // while the threads run, so the captured references stay valid.
  for (size_t i = 0; i < nthreads; i++) {
    threads.emplace_back([&count = num_labels[i], &buffer = labels[i]]() {
      count = generate_labels(buffer);
    });
  }

  // Wait for all of our threads to finish.
  for (auto& t : threads) {
    t.join();
  }

  // Check that we've generated the correct number of random labels, i.e. that
  // no label was produced twice across all threads.
  std::unordered_set<std::string> label_set;
  size_t total_labels = 0;
  for (size_t i = 0; i < nthreads; i++) {
    total_labels += num_labels[i];
    for (size_t j = 0; j < num_labels[i]; j++) {
      label_set.insert(labels[i][j]);
    }
  }
  REQUIRE(label_set.size() == total_labels);

  // Sort and validate the parallel threads as if they were serially generated.
  // Building the vector straight from the set avoids default-constructing
  // every element and then copying each label a second time.
  std::vector<std::string> all_labels(label_set.begin(), label_set.end());
  std::sort(all_labels.begin(), all_labels.end());
  validate_labels(all_labels, total_labels);
}

0 comments on commit 8ea85dc

Please sign in to comment.