Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change array schema format for schema evolution #2258

Merged
merged 30 commits into from Jul 5, 2021
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
9aa9e85
rebase format change for schema evolution to latest dev
May 5, 2021
0e2931f
fix some unit test errors
bdeng-xt May 7, 2021
397efbb
fix some failed unit tests for fragments
bdeng-xt May 7, 2021
6367c67
fix unused parameter warnings for default constructors
bdeng-xt May 9, 2021
03c6dfc
change consolidation.step_size_ratio from 0.75 to 0.78 to pass the un…
bdeng-xt May 10, 2021
04da8d9
make format
bdeng-xt May 10, 2021
b6db8ae
rebase format change for schema evolution to latest dev
May 5, 2021
1822f26
fix some unit test errors
bdeng-xt May 7, 2021
204e09f
fix some failed unit tests for fragments
bdeng-xt May 7, 2021
1902ada
fix unused parameter warnings for default constructors
bdeng-xt May 9, 2021
2db7a72
change consolidation.step_size_ratio from 0.75 to 0.78 to pass the un…
bdeng-xt May 10, 2021
c85a36f
make format
bdeng-xt May 10, 2021
f4a15e8
fix heap memory api violations
bdeng-xt May 10, 2021
ea9941f
rebase to dev
bdeng-xt May 10, 2021
9400b7b
fix error for back compatibility when array schema is stored in a file
bdeng-xt May 11, 2021
4a40cb8
try to fix questions raised by reviewers
bdeng-xt May 12, 2021
fd487b7
make StorageManager::is_array less expensive
bdeng-xt May 17, 2021
f0ee7ef
make is_array more robust
bdeng-xt May 18, 2021
12baba5
merge dev and resolve conflicts
bdeng-xt May 18, 2021
132d69c
merge dev and resolve conflict
bdeng-xt May 18, 2021
c09720f
reuse ls result when openning an array
bdeng-xt May 20, 2021
94d139e
make format
bdeng-xt May 20, 2021
e242195
Merge branch 'dev' into bd/schema-evolution-format-change
bdeng-xt May 20, 2021
bbdcbcf
merge dev and resolve coflicts
bdeng-xt May 21, 2021
ca5cb29
fix some coding styles on reviews
bdeng-xt May 27, 2021
1eafd60
Merge branch 'dev' into bd/schema-evolution-format-change
bdeng-xt Jun 16, 2021
04cd7f9
merge dev and not do a listing on top level array directory
bdeng-xt Jun 16, 2021
436b992
check is_dir before listing a schema directory
bdeng-xt Jun 16, 2021
521a993
Merge branch 'dev' into bd/schema-evolution-format-change
Jul 3, 2021
200c74f
fix the bug for is_array in StorageManager
Jul 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions test/src/helpers.cc
Expand Up @@ -943,6 +943,7 @@ int32_t num_fragments(const std::string& array_name) {
for (const auto& uri : uris) {
auto name = tiledb::sm::URI(uri).remove_trailing_slash().last_path_part();
if (name != tiledb::sm::constants::array_metadata_folder_name &&
name != tiledb::sm::constants::array_schema_folder_name &&
name.find_first_of('.') == std::string::npos)
++ret;
}
Expand Down
29 changes: 28 additions & 1 deletion test/src/unit-capi-array_schema.cc
Expand Up @@ -101,6 +101,15 @@ struct ArraySchemaFx {
// Vector of supported filesystems
const std::vector<std::unique_ptr<SupportedFs>> fs_vec_;

// Payload handed to `get_schema_file_struct` through the `void* data`
// argument of `tiledb_vfs_ls`: `ctx` and `vfs` are the handles the callback
// uses for its `tiledb_vfs_is_dir` query, and `path` receives the path the
// callback was invoked with.
struct schema_file_struct {
tiledb_ctx_t* ctx;
tiledb_vfs_t* vfs;
std::string path;
};

// Callback passed to `tiledb_vfs_ls`; stores `path` into the
// `schema_file_struct` that `data` points to and returns 1.
static int get_schema_file_struct(const char* path, void* data);

// Functions
ArraySchemaFx();
~ArraySchemaFx();
Expand Down Expand Up @@ -903,6 +912,18 @@ std::string ArraySchemaFx::random_name(const std::string& prefix) {
return ss.str();
}

int ArraySchemaFx::get_schema_file_struct(const char* path, void* data) {
auto data_struct = (ArraySchemaFx::schema_file_struct*)data;
auto ctx = data_struct->ctx;
auto vfs = data_struct->vfs;
int is_dir;
int rc = tiledb_vfs_is_dir(ctx, vfs, path, &is_dir);
CHECK(rc == TILEDB_OK);

data_struct->path = path;
return 1;
}

TEST_CASE_METHOD(
ArraySchemaFx,
"C API: Test array schema creation and retrieval",
Expand Down Expand Up @@ -1365,9 +1386,15 @@ TEST_CASE_METHOD(
tiledb_array_schema_free(&array_schema);

// Corrupt the array schema
std::string schema_path = array_name + "/__array_schema.tdb";
std::string schema_path =
array_name + "/" + tiledb::sm::constants::array_schema_folder_name;
std::string to_write = "garbage";
tiledb_vfs_fh_t* fh;
schema_file_struct data_struct = {ctx_, vfs_, ""};
rc = tiledb_vfs_ls(
ctx_, vfs_, schema_path.c_str(), &get_schema_file_struct, &data_struct);
schema_path = data_struct.path;

rc = tiledb_vfs_open(ctx_, vfs_, schema_path.c_str(), TILEDB_VFS_WRITE, &fh);
REQUIRE(rc == TILEDB_OK);
rc = tiledb_vfs_write(ctx_, fh, to_write.c_str(), to_write.size());
Expand Down
9 changes: 6 additions & 3 deletions test/src/unit-capi-consolidation.cc
Expand Up @@ -4492,8 +4492,11 @@ int ConsolidationFx::get_dir_num(const char* path, void* data) {
CHECK(rc == TILEDB_OK);
auto meta_dir =
std::string("/") + tiledb::sm::constants::array_metadata_folder_name;
if (!tiledb::sm::utils::parse::ends_with(path, meta_dir)) {
// Ignoring the meta folder
auto schema_dir =
std::string("/") + tiledb::sm::constants::array_schema_folder_name;
if (!tiledb::sm::utils::parse::ends_with(path, meta_dir) &&
!tiledb::sm::utils::parse::ends_with(path, schema_dir)) {
// Ignoring the meta folder and the schema folder
data_struct->num += is_dir;
}

Expand Down Expand Up @@ -5671,7 +5674,7 @@ TEST_CASE_METHOD(
REQUIRE(rc == TILEDB_OK);
REQUIRE(error == nullptr);
rc = tiledb_config_set(
config, "sm.consolidation.step_size_ratio", "0.75", &error);
config, "sm.consolidation.step_size_ratio", "0.78", &error);
REQUIRE(rc == TILEDB_OK);
REQUIRE(error == nullptr);

Expand Down
24 changes: 12 additions & 12 deletions test/src/unit-capi-fragment_info.cc
Expand Up @@ -236,7 +236,7 @@ TEST_CASE(
uint64_t size;
rc = tiledb_fragment_info_get_fragment_size(ctx, fragment_info, 1, &size);
CHECK(rc == TILEDB_OK);
CHECK(size == 1708);
CHECK(size == 1786);

// Get dense / sparse
int32_t dense;
Expand Down Expand Up @@ -451,7 +451,7 @@ TEST_CASE(
uint64_t size;
rc = tiledb_fragment_info_get_fragment_size(ctx, fragment_info, 1, &size);
CHECK(rc == TILEDB_OK);
CHECK(size == 3061);
CHECK(size == 3139);

// Get dense / sparse
int32_t dense;
Expand Down Expand Up @@ -1000,17 +1000,17 @@ TEST_CASE("C API: Test fragment info, dump", "[capi][fragment_info][dump]") {
"- Unconsolidated metadata num: 3\n" + "- To vacuum num: 0\n" +
"- Fragment #1:\n" + " > URI: " + written_frag_uri_1 + "\n" +
" > Type: dense\n" + " > Non-empty domain: [1, 6]\n" +
" > Size: 1584\n" + " > Cell num: 10\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 9\n" +
" > Size: 1662\n" + " > Cell num: 10\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 10\n" +
" > Has consolidated metadata: no\n" + "- Fragment #2:\n" +
" > URI: " + written_frag_uri_2 + "\n" + " > Type: sparse\n" +
" > Non-empty domain: [1, 7]\n" + " > Size: 1708\n" +
" > Non-empty domain: [1, 7]\n" + " > Size: 1786\n" +
" > Cell num: 4\n" + " > Timestamp range: [2, 2]\n" +
" > Format version: 9\n" + " > Has consolidated metadata: no\n" +
" > Format version: 10\n" + " > Has consolidated metadata: no\n" +
"- Fragment #3:\n" + " > URI: " + written_frag_uri_3 + "\n" +
" > Type: sparse\n" + " > Non-empty domain: [2, 9]\n" +
" > Size: 1696\n" + " > Cell num: 3\n" +
" > Timestamp range: [3, 3]\n" + " > Format version: 9\n" +
" > Size: 1774\n" + " > Cell num: 3\n" +
" > Timestamp range: [3, 3]\n" + " > Format version: 10\n" +
" > Has consolidated metadata: no\n";
FILE* gold_fout = fopen("gold_fout.txt", "w");
const char* dump = dump_str.c_str();
Expand Down Expand Up @@ -1128,9 +1128,9 @@ TEST_CASE(
"- To vacuum URIs:\n" + " > " + written_frag_uri_1 + "\n > " +
written_frag_uri_2 + "\n > " + written_frag_uri_3 + "\n" +
"- Fragment #1:\n" + " > URI: " + uri + "\n" + " > Type: dense\n" +
" > Non-empty domain: [1, 10]\n" + " > Size: 1584\n" +
" > Non-empty domain: [1, 10]\n" + " > Size: 1662\n" +
" > Cell num: 10\n" + " > Timestamp range: [1, 3]\n" +
" > Format version: 9\n" + " > Has consolidated metadata: no\n";
" > Format version: 10\n" + " > Has consolidated metadata: no\n";
FILE* gold_fout = fopen("gold_fout.txt", "w");
const char* dump = dump_str.c_str();
fwrite(dump, sizeof(char), strlen(dump), gold_fout);
Expand Down Expand Up @@ -1211,8 +1211,8 @@ TEST_CASE(
"- Unconsolidated metadata num: 1\n" + "- To vacuum num: 0\n" +
"- Fragment #1:\n" + " > URI: " + written_frag_uri + "\n" +
" > Type: sparse\n" + " > Non-empty domain: [a, ddd]\n" +
" > Size: 1833\n" + " > Cell num: 4\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 9\n" +
" > Size: 1903\n" + " > Cell num: 4\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 10\n" +
" > Has consolidated metadata: no\n";
FILE* gold_fout = fopen("gold_fout.txt", "w");
const char* dump = dump_str.c_str();
Expand Down
7 changes: 5 additions & 2 deletions test/src/unit-capi-string_dims.cc
Expand Up @@ -207,8 +207,11 @@ int StringDimsFx::get_dir_num(const char* path, void* data) {
CHECK(rc == TILEDB_OK);
auto meta_dir =
std::string("/") + tiledb::sm::constants::array_metadata_folder_name;
if (!tiledb::sm::utils::parse::ends_with(path, meta_dir)) {
// Ignoring the meta folder
auto schema_dir =
std::string("/") + tiledb::sm::constants::array_schema_folder_name;
if (!tiledb::sm::utils::parse::ends_with(path, meta_dir) &&
!tiledb::sm::utils::parse::ends_with(path, schema_dir)) {
// Ignoring the meta folder and the schema folder
data_struct->num += is_dir;
}

Expand Down
14 changes: 7 additions & 7 deletions test/src/unit-cppapi-fragment_info.cc
Expand Up @@ -199,7 +199,7 @@ TEST_CASE(

// Get fragment size
auto size = fragment_info.fragment_size(1);
CHECK(size == 1708);
CHECK(size == 1786);

// Get dense / sparse
auto dense = fragment_info.dense(0);
Expand Down Expand Up @@ -625,17 +625,17 @@ TEST_CASE(
"- Unconsolidated metadata num: 3\n" + "- To vacuum num: 0\n" +
"- Fragment #1:\n" + " > URI: " + written_frag_uri_1 + "\n" +
" > Type: dense\n" + " > Non-empty domain: [1, 6]\n" +
" > Size: 1584\n" + " > Cell num: 10\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 9\n" +
" > Size: 1662\n" + " > Cell num: 10\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 10\n" +
" > Has consolidated metadata: no\n" + "- Fragment #2:\n" +
" > URI: " + written_frag_uri_2 + "\n" + " > Type: sparse\n" +
" > Non-empty domain: [1, 7]\n" + " > Size: 1708\n" +
" > Non-empty domain: [1, 7]\n" + " > Size: 1786\n" +
" > Cell num: 4\n" + " > Timestamp range: [2, 2]\n" +
" > Format version: 9\n" + " > Has consolidated metadata: no\n" +
" > Format version: 10\n" + " > Has consolidated metadata: no\n" +
"- Fragment #3:\n" + " > URI: " + written_frag_uri_3 + "\n" +
" > Type: sparse\n" + " > Non-empty domain: [2, 9]\n" +
" > Size: 1696\n" + " > Cell num: 3\n" +
" > Timestamp range: [3, 3]\n" + " > Format version: 9\n" +
" > Size: 1774\n" + " > Cell num: 3\n" +
" > Timestamp range: [3, 3]\n" + " > Format version: 10\n" +
" > Has consolidated metadata: no\n";
FILE* gold_fout = fopen("gold_fout.txt", "w");
const char* dump = dump_str.c_str();
Expand Down
3 changes: 2 additions & 1 deletion tiledb/sm/array/array.cc
Expand Up @@ -140,8 +140,9 @@ Status Array::open_without_fragments(
RETURN_NOT_OK(
rest_client->get_array_schema_from_rest(array_uri_, &array_schema_));
} else {
std::vector<URI> ls_uris;
RETURN_NOT_OK(storage_manager_->array_open_for_reads_without_fragments(
array_uri_, *encryption_key_, &array_schema_));
array_uri_, *encryption_key_, &array_schema_, &ls_uris));
}

is_open_ = true;
Expand Down
75 changes: 75 additions & 0 deletions tiledb/sm/array_schema/array_schema.cc
Expand Up @@ -68,11 +68,15 @@ ArraySchema::ArraySchema(ArrayType array_type)
: array_type_(array_type) {
allows_dups_ = false;
array_uri_ = URI();
uri_ = URI();
name_ = "";
capacity_ = constants::capacity;
cell_order_ = Layout::ROW_MAJOR;
domain_ = nullptr;
tile_order_ = Layout::ROW_MAJOR;
version_ = constants::format_version;
auto timestamp = utils::time::timestamp_now_ms();
timestamp_range_ = std::make_pair(timestamp, timestamp);
Shelnutt2 marked this conversation as resolved.
Show resolved Hide resolved

// Set up default filter pipelines for coords, offsets, and validity values.
coords_filters_.add_filter(CompressionFilter(
Expand All @@ -88,8 +92,11 @@ ArraySchema::ArraySchema(ArrayType array_type)
ArraySchema::ArraySchema(const ArraySchema* array_schema) {
allows_dups_ = array_schema->allows_dups_;
array_uri_ = array_schema->array_uri_;
uri_ = array_schema->uri_;
name_ = array_schema->name_;
array_type_ = array_schema->array_type_;
domain_ = nullptr;
timestamp_range_ = array_schema->timestamp_range_;

capacity_ = array_schema->capacity_;
cell_order_ = array_schema->cell_order_;
Expand Down Expand Up @@ -689,6 +696,57 @@ uint32_t ArraySchema::version() const {
return version_;
}

/** Overwrites the stored timestamp range with `timestamp_range`; always Ok. */
Status ArraySchema::set_timestamp_range(
    const std::pair<uint64_t, uint64_t>& timestamp_range) {
  timestamp_range_.first = timestamp_range.first;
  timestamp_range_.second = timestamp_range.second;
  return Status::Ok();
}

const std::pair<uint64_t, uint64_t>& ArraySchema::timestamp_range() const {
return timestamp_range_;
}

/**
 * Returns the schema URI, generating one first when the stored URI is
 * invalid. The status returned by `generate_uri()` is not inspected.
 */
const URI& ArraySchema::uri() {
  std::lock_guard<std::mutex> guard(mtx_);

  // Fast path: a URI has already been set or generated.
  if (!uri_.is_invalid()) {
    return uri_;
  }

  generate_uri();
  return uri_;
}

/**
 * Stores `uri` as the schema URI, takes the schema name from its last path
 * part, and asks `utils::parse::get_timestamp_range` to fill in the timestamp
 * range from it. The result of that parse call is not inspected.
 */
void ArraySchema::set_uri(URI& uri) {
  std::lock_guard<std::mutex> guard(mtx_);

  uri_ = uri;

  // Both derived fields are computed from the freshly stored member.
  const URI& stored = uri_;
  name_ = stored.last_path_part();
  utils::parse::get_timestamp_range(stored, &timestamp_range_);
}

/**
 * Copies the schema URI into `*uri` without generating one.
 *
 * @param uri Output location for the URI; written only on success.
 * @return An ArraySchemaError status if the stored URI is invalid, Ok
 *     otherwise.
 */
Status ArraySchema::get_uri(URI* uri) {
  // uri() and set_uri() modify uri_ while holding mtx_, so the read here
  // takes the same lock to avoid racing with them.
  std::lock_guard<std::mutex> lock(mtx_);
  if (uri_.is_invalid()) {
    return LOG_STATUS(
        Status::ArraySchemaError("Error in ArraySchema; invalid URI"));
  }
  *uri = uri_;
  return Status::Ok();
}

/**
 * Returns the schema name, generating a URI (and with it a name) first when
 * the stored name is empty. The status returned by `generate_uri()` is not
 * inspected.
 */
const std::string& ArraySchema::name() {
  std::lock_guard<std::mutex> guard(mtx_);

  // Fast path: a name has already been set or generated.
  if (!name_.empty()) {
    return name_;
  }

  generate_uri();
  return name_;
}

/**
 * Copies the schema name into `*name` without generating one.
 *
 * @param name Output location for the name; written only on success.
 * @return An ArraySchemaError status if the stored name is empty, Ok
 *     otherwise.
 */
Status ArraySchema::get_name(std::string* name) const {
  // name() and set_uri() modify name_ while holding mtx_, so the read here
  // takes the same lock to avoid racing with them.
  std::lock_guard<std::mutex> lock(mtx_);
  if (name_.empty()) {
    return LOG_STATUS(
        Status::ArraySchemaError("Error in ArraySchema; Empty name"));
  }
  *name = name_;
  return Status::Ok();
}

/* ****************************** */
/* PRIVATE METHODS */
/* ****************************** */
Expand Down Expand Up @@ -736,6 +794,8 @@ Status ArraySchema::check_double_delta_compressor() const {

void ArraySchema::clear() {
array_uri_ = URI();
uri_ = URI();
name_.clear();
array_type_ = ArrayType::DENSE;
capacity_ = constants::capacity;
cell_order_ = Layout::ROW_MAJOR;
Expand All @@ -747,6 +807,21 @@ void ArraySchema::clear() {

tdb_delete(domain_);
domain_ = nullptr;
timestamp_range_ = std::make_pair(0, 0);
}

/**
 * Builds a fresh schema name of the form `__<first>_<second>_<uuid>` from the
 * stored timestamp range and a newly generated UUID, then sets the schema URI
 * to `<array_uri>/<array_schema_folder_name>/<name>`.
 *
 * @return The failing status from `uuid::generate_uuid`, or Ok.
 */
Status ArraySchema::generate_uri() {
  // NOTE(review): the `false` flag presumably requests an un-hyphenated
  // UUID - confirm against uuid::generate_uuid.
  std::string uuid_str;
  RETURN_NOT_OK(uuid::generate_uuid(&uuid_str, false));

  std::stringstream name_stream;
  name_stream << "__" << timestamp_range_.first;
  name_stream << "_" << timestamp_range_.second;
  name_stream << "_" << uuid_str;
  name_ = name_stream.str();

  URI schema_dir = array_uri_.join_path(constants::array_schema_folder_name);
  uri_ = schema_dir.join_path(name_);

  return Status::Ok();
}

} // namespace sm
Expand Down
44 changes: 44 additions & 0 deletions tiledb/sm/array_schema/array_schema.h
Expand Up @@ -39,7 +39,9 @@
#include "tiledb/common/status.h"
#include "tiledb/sm/filter/filter_pipeline.h"
#include "tiledb/sm/misc/constants.h"
#include "tiledb/sm/misc/hilbert.h"
#include "tiledb/sm/misc/uri.h"
#include "tiledb/sm/misc/uuid.h"

using namespace tiledb::common;

Expand Down Expand Up @@ -297,6 +299,28 @@ class ArraySchema {
/** Returns the array schema version. */
uint32_t version() const;

/** Set a timestamp range for the array schema */
Shelnutt2 marked this conversation as resolved.
Show resolved Hide resolved
Status set_timestamp_range(
const std::pair<uint64_t, uint64_t>& timestamp_range);

/** Returns the timestamp range */
const std::pair<uint64_t, uint64_t>& timestamp_range() const;
Shelnutt2 marked this conversation as resolved.
Show resolved Hide resolved

/** Returns the array schema uri */
const URI& uri();

/** Set schema URI, along with parsing out timestamp ranges and name */
void set_uri(URI& uri);

/** Get schema URI with return status */
Status get_uri(URI* uri);

/** Returns the schema name. If it is not set, will build it */
const std::string& name();

/** Returns the schema name. If it is not set, returns an error status */
Status get_name(std::string* name) const;

private:
/* ********************************* */
/* PRIVATE ATTRIBUTES */
Expand Down Expand Up @@ -356,6 +380,23 @@ class ArraySchema {
/** The format version of this array schema. */
uint32_t version_;

/** Mutex for thread-safety. */
mutable std::mutex mtx_;

/**
* The timestamp the array schema was written.
* This is used to determine the array schema file name.
* The two timestamps are identical.
* It is stored as a pair to keep the usage consistent with metadata
*/
std::pair<uint64_t, uint64_t> timestamp_range_;

/** The URI of the array schema file. */
URI uri_;

/** The file name of the array schema, in the format __timestamp_timestamp_uuid */
bdeng-xt marked this conversation as resolved.
Show resolved Hide resolved
std::string name_;

/* ********************************* */
/* PRIVATE METHODS */
/* ********************************* */
Expand All @@ -374,6 +415,9 @@ class ArraySchema {

/** Clears all members. Use with caution! */
void clear();

/** Generates a new array schema URI. */
Status generate_uri();
};

} // namespace sm
Expand Down