Skip to content

Commit

Permalink
Change array schema format for schema evolution (#2258)
Browse files Browse the repository at this point in the history
* rebase format change for schema evolution to latest dev

* fix some unit test errors

* fix some failed unit tests for fragments

* fix unused parameter warnings for default constructors

* change consolidation.step_size_ratio from 0.75 to 0.78 to pass the unit test with ratio 0.7523

* make format

* rebase format change for schema evolution to latest dev

* fix some unit test errors

* fix some failed unit tests for fragments

* fix unused parameter warnings for default constructors

* change consolidation.step_size_ratio from 0.75 to 0.78 to pass the unit test with ratio 0.7523

* make format

* fix heap memory api violations

* fix error for back compatibility when array schema is stored in a file

* try to fix questions raised by reviewers

* make StorageManager::is_array less expensive

* make is_array more robust

* reuse ls result when openning an array

* make format

* fix some coding styles on reviews

* merge dev and not do a listing on top level array directory

* check is_dir before listing a schema directory

* fix the bug for is_array in StorageManager

Co-authored-by: bdeng.xt@gmail.com <bdeng@DESKTOP-NT5F11P.localdomain>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-21-150.us-east-2.compute.internal>
  • Loading branch information
3 people committed Jul 5, 2021
1 parent a26e9f6 commit 41e5e8f
Show file tree
Hide file tree
Showing 17 changed files with 356 additions and 43 deletions.
1 change: 1 addition & 0 deletions test/src/helpers.cc
Expand Up @@ -968,6 +968,7 @@ int32_t num_fragments(const std::string& array_name) {
for (const auto& uri : uris) {
auto name = tiledb::sm::URI(uri).remove_trailing_slash().last_path_part();
if (name != tiledb::sm::constants::array_metadata_folder_name &&
name != tiledb::sm::constants::array_schema_folder_name &&
name.find_first_of('.') == std::string::npos)
++ret;
}
Expand Down
2 changes: 2 additions & 0 deletions test/src/unit-capi-array.cc
Expand Up @@ -514,6 +514,8 @@ TEST_CASE_METHOD(
REQUIRE(rc == TILEDB_ERR);
tiledb_ctx_free(&ctx_invalid_key_len_2);
tiledb_vfs_free(&vfs_invalid_key_len_2);
// remove the empty array directory
remove_temp_dir(array_name);

// Create array with proper key
rc = tiledb_config_set(cfg, "sm.encryption_type", "AES_256_GCM", &err);
Expand Down
29 changes: 28 additions & 1 deletion test/src/unit-capi-array_schema.cc
Expand Up @@ -101,6 +101,15 @@ struct ArraySchemaFx {
// Vector of supported filsystems
const std::vector<std::unique_ptr<SupportedFs>> fs_vec_;

// struct for information of another directory
struct schema_file_struct {
tiledb_ctx_t* ctx;
tiledb_vfs_t* vfs;
std::string path;
};

static int get_schema_file_struct(const char* path, void* data);

// Functions
ArraySchemaFx();
~ArraySchemaFx();
Expand Down Expand Up @@ -903,6 +912,18 @@ std::string ArraySchemaFx::random_name(const std::string& prefix) {
return ss.str();
}

int ArraySchemaFx::get_schema_file_struct(const char* path, void* data) {
auto data_struct = (ArraySchemaFx::schema_file_struct*)data;
auto ctx = data_struct->ctx;
auto vfs = data_struct->vfs;
int is_dir;
int rc = tiledb_vfs_is_dir(ctx, vfs, path, &is_dir);
CHECK(rc == TILEDB_OK);

data_struct->path = path;
return 1;
}

TEST_CASE_METHOD(
ArraySchemaFx,
"C API: Test array schema creation and retrieval",
Expand Down Expand Up @@ -1365,9 +1386,15 @@ TEST_CASE_METHOD(
tiledb_array_schema_free(&array_schema);

// Corrupt the array schema
std::string schema_path = array_name + "/__array_schema.tdb";
std::string schema_path =
array_name + "/" + tiledb::sm::constants::array_schema_folder_name;
std::string to_write = "garbage";
tiledb_vfs_fh_t* fh;
schema_file_struct data_struct = {ctx_, vfs_, ""};
rc = tiledb_vfs_ls(
ctx_, vfs_, schema_path.c_str(), &get_schema_file_struct, &data_struct);
schema_path = data_struct.path;

rc = tiledb_vfs_open(ctx_, vfs_, schema_path.c_str(), TILEDB_VFS_WRITE, &fh);
REQUIRE(rc == TILEDB_OK);
rc = tiledb_vfs_write(ctx_, fh, to_write.c_str(), to_write.size());
Expand Down
9 changes: 6 additions & 3 deletions test/src/unit-capi-consolidation.cc
Expand Up @@ -4490,8 +4490,11 @@ int ConsolidationFx::get_dir_num(const char* path, void* data) {
CHECK(rc == TILEDB_OK);
auto meta_dir =
std::string("/") + tiledb::sm::constants::array_metadata_folder_name;
if (!tiledb::sm::utils::parse::ends_with(path, meta_dir)) {
// Ignoring the meta folder
auto schema_dir =
std::string("/") + tiledb::sm::constants::array_schema_folder_name;
if (!tiledb::sm::utils::parse::ends_with(path, meta_dir) &&
!tiledb::sm::utils::parse::ends_with(path, schema_dir)) {
// Ignoring the meta folder and the schema folder
data_struct->num += is_dir;
}

Expand Down Expand Up @@ -5668,7 +5671,7 @@ TEST_CASE_METHOD(
REQUIRE(rc == TILEDB_OK);
REQUIRE(error == nullptr);
rc = tiledb_config_set(
config, "sm.consolidation.step_size_ratio", "0.75", &error);
config, "sm.consolidation.step_size_ratio", "0.78", &error);
REQUIRE(rc == TILEDB_OK);
REQUIRE(error == nullptr);

Expand Down
24 changes: 12 additions & 12 deletions test/src/unit-capi-fragment_info.cc
Expand Up @@ -250,7 +250,7 @@ TEST_CASE(
uint64_t size;
rc = tiledb_fragment_info_get_fragment_size(ctx, fragment_info, 1, &size);
CHECK(rc == TILEDB_OK);
CHECK(size == 1708);
CHECK(size == 1786);

// Get dense / sparse
int32_t dense;
Expand Down Expand Up @@ -481,7 +481,7 @@ TEST_CASE(
uint64_t size;
rc = tiledb_fragment_info_get_fragment_size(ctx, fragment_info, 1, &size);
CHECK(rc == TILEDB_OK);
CHECK(size == 3061);
CHECK(size == 3139);

// Get dense / sparse
int32_t dense;
Expand Down Expand Up @@ -1030,17 +1030,17 @@ TEST_CASE("C API: Test fragment info, dump", "[capi][fragment_info][dump]") {
"- Unconsolidated metadata num: 3\n" + "- To vacuum num: 0\n" +
"- Fragment #1:\n" + " > URI: " + written_frag_uri_1 + "\n" +
" > Type: dense\n" + " > Non-empty domain: [1, 6]\n" +
" > Size: 1584\n" + " > Cell num: 10\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 9\n" +
" > Size: 1662\n" + " > Cell num: 10\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 10\n" +
" > Has consolidated metadata: no\n" + "- Fragment #2:\n" +
" > URI: " + written_frag_uri_2 + "\n" + " > Type: sparse\n" +
" > Non-empty domain: [1, 7]\n" + " > Size: 1708\n" +
" > Non-empty domain: [1, 7]\n" + " > Size: 1786\n" +
" > Cell num: 4\n" + " > Timestamp range: [2, 2]\n" +
" > Format version: 9\n" + " > Has consolidated metadata: no\n" +
" > Format version: 10\n" + " > Has consolidated metadata: no\n" +
"- Fragment #3:\n" + " > URI: " + written_frag_uri_3 + "\n" +
" > Type: sparse\n" + " > Non-empty domain: [2, 9]\n" +
" > Size: 1696\n" + " > Cell num: 3\n" +
" > Timestamp range: [3, 3]\n" + " > Format version: 9\n" +
" > Size: 1774\n" + " > Cell num: 3\n" +
" > Timestamp range: [3, 3]\n" + " > Format version: 10\n" +
" > Has consolidated metadata: no\n";
FILE* gold_fout = fopen("gold_fout.txt", "w");
const char* dump = dump_str.c_str();
Expand Down Expand Up @@ -1158,9 +1158,9 @@ TEST_CASE(
"- To vacuum URIs:\n" + " > " + written_frag_uri_1 + "\n > " +
written_frag_uri_2 + "\n > " + written_frag_uri_3 + "\n" +
"- Fragment #1:\n" + " > URI: " + uri + "\n" + " > Type: dense\n" +
" > Non-empty domain: [1, 10]\n" + " > Size: 1584\n" +
" > Non-empty domain: [1, 10]\n" + " > Size: 1662\n" +
" > Cell num: 10\n" + " > Timestamp range: [1, 3]\n" +
" > Format version: 9\n" + " > Has consolidated metadata: no\n";
" > Format version: 10\n" + " > Has consolidated metadata: no\n";
FILE* gold_fout = fopen("gold_fout.txt", "w");
const char* dump = dump_str.c_str();
fwrite(dump, sizeof(char), strlen(dump), gold_fout);
Expand Down Expand Up @@ -1241,8 +1241,8 @@ TEST_CASE(
"- Unconsolidated metadata num: 1\n" + "- To vacuum num: 0\n" +
"- Fragment #1:\n" + " > URI: " + written_frag_uri + "\n" +
" > Type: sparse\n" + " > Non-empty domain: [a, ddd]\n" +
" > Size: 1833\n" + " > Cell num: 4\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 9\n" +
" > Size: 1903\n" + " > Cell num: 4\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 10\n" +
" > Has consolidated metadata: no\n";
FILE* gold_fout = fopen("gold_fout.txt", "w");
const char* dump = dump_str.c_str();
Expand Down
7 changes: 5 additions & 2 deletions test/src/unit-capi-string_dims.cc
Expand Up @@ -207,8 +207,11 @@ int StringDimsFx::get_dir_num(const char* path, void* data) {
CHECK(rc == TILEDB_OK);
auto meta_dir =
std::string("/") + tiledb::sm::constants::array_metadata_folder_name;
if (!tiledb::sm::utils::parse::ends_with(path, meta_dir)) {
// Ignoring the meta folder
auto schema_dir =
std::string("/") + tiledb::sm::constants::array_schema_folder_name;
if (!tiledb::sm::utils::parse::ends_with(path, meta_dir) &&
!tiledb::sm::utils::parse::ends_with(path, schema_dir)) {
// Ignoring the meta folder and the schema folder
data_struct->num += is_dir;
}

Expand Down
14 changes: 7 additions & 7 deletions test/src/unit-cppapi-fragment_info.cc
Expand Up @@ -199,7 +199,7 @@ TEST_CASE(

// Get fragment size
auto size = fragment_info.fragment_size(1);
CHECK(size == 1708);
CHECK(size == 1786);

// Get dense / sparse
auto dense = fragment_info.dense(0);
Expand Down Expand Up @@ -625,17 +625,17 @@ TEST_CASE(
"- Unconsolidated metadata num: 3\n" + "- To vacuum num: 0\n" +
"- Fragment #1:\n" + " > URI: " + written_frag_uri_1 + "\n" +
" > Type: dense\n" + " > Non-empty domain: [1, 6]\n" +
" > Size: 1584\n" + " > Cell num: 10\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 9\n" +
" > Size: 1662\n" + " > Cell num: 10\n" +
" > Timestamp range: [1, 1]\n" + " > Format version: 10\n" +
" > Has consolidated metadata: no\n" + "- Fragment #2:\n" +
" > URI: " + written_frag_uri_2 + "\n" + " > Type: sparse\n" +
" > Non-empty domain: [1, 7]\n" + " > Size: 1708\n" +
" > Non-empty domain: [1, 7]\n" + " > Size: 1786\n" +
" > Cell num: 4\n" + " > Timestamp range: [2, 2]\n" +
" > Format version: 9\n" + " > Has consolidated metadata: no\n" +
" > Format version: 10\n" + " > Has consolidated metadata: no\n" +
"- Fragment #3:\n" + " > URI: " + written_frag_uri_3 + "\n" +
" > Type: sparse\n" + " > Non-empty domain: [2, 9]\n" +
" > Size: 1696\n" + " > Cell num: 3\n" +
" > Timestamp range: [3, 3]\n" + " > Format version: 9\n" +
" > Size: 1774\n" + " > Cell num: 3\n" +
" > Timestamp range: [3, 3]\n" + " > Format version: 10\n" +
" > Has consolidated metadata: no\n";
FILE* gold_fout = fopen("gold_fout.txt", "w");
const char* dump = dump_str.c_str();
Expand Down
77 changes: 77 additions & 0 deletions tiledb/sm/array_schema/array_schema.cc
Expand Up @@ -68,11 +68,15 @@ ArraySchema::ArraySchema(ArrayType array_type)
: array_type_(array_type) {
allows_dups_ = false;
array_uri_ = URI();
uri_ = URI();
name_ = "";
capacity_ = constants::capacity;
cell_order_ = Layout::ROW_MAJOR;
domain_ = nullptr;
tile_order_ = Layout::ROW_MAJOR;
version_ = constants::format_version;
auto timestamp = utils::time::timestamp_now_ms();
timestamp_range_ = std::make_pair(timestamp, timestamp);

// Set up default filter pipelines for coords, offsets, and validity values.
coords_filters_.add_filter(CompressionFilter(
Expand All @@ -88,8 +92,11 @@ ArraySchema::ArraySchema(ArrayType array_type)
ArraySchema::ArraySchema(const ArraySchema* array_schema) {
allows_dups_ = array_schema->allows_dups_;
array_uri_ = array_schema->array_uri_;
uri_ = array_schema->uri_;
name_ = array_schema->name_;
array_type_ = array_schema->array_type_;
domain_ = nullptr;
timestamp_range_ = array_schema->timestamp_range_;

capacity_ = array_schema->capacity_;
cell_order_ = array_schema->cell_order_;
Expand Down Expand Up @@ -689,6 +696,59 @@ uint32_t ArraySchema::version() const {
return version_;
}

Status ArraySchema::set_timestamp_range(
const std::pair<uint64_t, uint64_t>& timestamp_range) {
timestamp_range_ = timestamp_range;
return Status::Ok();
}

std::pair<uint64_t, uint64_t> ArraySchema::timestamp_range() const {
return std::pair<uint64_t, uint64_t>(
timestamp_range_.first, timestamp_range_.second);
}

URI ArraySchema::uri() {
std::lock_guard<std::mutex> lock(mtx_);
if (uri_.is_invalid()) {
generate_uri();
}
URI result = uri_;
return result;
}

void ArraySchema::set_uri(URI& uri) {
std::lock_guard<std::mutex> lock(mtx_);
uri_ = uri;
name_ = uri_.last_path_part();
utils::parse::get_timestamp_range(uri_, &timestamp_range_);
}

Status ArraySchema::get_uri(URI* uri) {
if (uri_.is_invalid()) {
return LOG_STATUS(
Status::ArraySchemaError("Error in ArraySchema; invalid URI"));
}
*uri = uri_;
return Status::Ok();
}

std::string ArraySchema::name() {
std::lock_guard<std::mutex> lock(mtx_);
if (name_.empty()) {
generate_uri();
}
return name_;
}

Status ArraySchema::get_name(std::string* name) const {
if (name_.empty()) {
return LOG_STATUS(
Status::ArraySchemaError("Error in ArraySchema; Empty name"));
}
*name = name_;
return Status::Ok();
}

/* ****************************** */
/* PRIVATE METHODS */
/* ****************************** */
Expand Down Expand Up @@ -736,6 +796,8 @@ Status ArraySchema::check_double_delta_compressor() const {

void ArraySchema::clear() {
array_uri_ = URI();
uri_ = URI();
name_.clear();
array_type_ = ArrayType::DENSE;
capacity_ = constants::capacity;
cell_order_ = Layout::ROW_MAJOR;
Expand All @@ -747,6 +809,21 @@ void ArraySchema::clear() {

tdb_delete(domain_);
domain_ = nullptr;
timestamp_range_ = std::make_pair(0, 0);
}

Status ArraySchema::generate_uri() {
std::string uuid;
RETURN_NOT_OK(uuid::generate_uuid(&uuid, false));

std::stringstream ss;
ss << "__" << timestamp_range_.first << "_" << timestamp_range_.second << "_"
<< uuid;
name_ = ss.str();
uri_ = array_uri_.join_path(constants::array_schema_folder_name)
.join_path(name_);

return Status::Ok();
}

} // namespace sm
Expand Down
44 changes: 44 additions & 0 deletions tiledb/sm/array_schema/array_schema.h
Expand Up @@ -39,7 +39,9 @@
#include "tiledb/common/status.h"
#include "tiledb/sm/filter/filter_pipeline.h"
#include "tiledb/sm/misc/constants.h"
#include "tiledb/sm/misc/hilbert.h"
#include "tiledb/sm/misc/uri.h"
#include "tiledb/sm/misc/uuid.h"

using namespace tiledb::common;

Expand Down Expand Up @@ -297,6 +299,28 @@ class ArraySchema {
/** Returns the array schema version. */
uint32_t version() const;

/** Set a timestamp range for the array schema */
Status set_timestamp_range(
const std::pair<uint64_t, uint64_t>& timestamp_range);

/** Returns the timestamp range. */
std::pair<uint64_t, uint64_t> timestamp_range() const;

/** Returns the array schema uri. */
URI uri();

/** Set schema URI, along with parsing out timestamp ranges and name. */
void set_uri(URI& uri);

/** Get schema URI with return status. */
Status get_uri(URI* uri);

/** Returns the schema name. If it is not set, will build it. */
std::string name();

/** Returns the schema name. If it is not set, will returns error status. */
Status get_name(std::string* name) const;

private:
/* ********************************* */
/* PRIVATE ATTRIBUTES */
Expand Down Expand Up @@ -356,6 +380,23 @@ class ArraySchema {
/** The format version of this array schema. */
uint32_t version_;

/** Mutex for thread-safety. */
mutable std::mutex mtx_;

/**
* The timestamp the array schema was written.
* This is used to determine the array schema file name.
* The two timestamps are identical.
* It is stored as a pair to keep the usage consistent with metadata
*/
std::pair<uint64_t, uint64_t> timestamp_range_;

/** The URI of the array schema file. */
URI uri_;

/** The file name of array schema in the format of timestamp_timestamp_uuid */
std::string name_;

/* ********************************* */
/* PRIVATE METHODS */
/* ********************************* */
Expand All @@ -374,6 +415,9 @@ class ArraySchema {

/** Clears all members. Use with caution! */
void clear();

/** Generates a new array schema URI. */
Status generate_uri();
};

} // namespace sm
Expand Down

0 comments on commit 41e5e8f

Please sign in to comment.