Skip to content

Commit

Permalink
Reduce the number of requests in dir_size (#3382) (#3405)
Browse files Browse the repository at this point in the history
* Extend ls_with_sizes to retrieve is_directory metadata

Optimize no of requests for dir_size

* address comment from @-KiterLuc, fix hdfs failing build

Co-authored-by: Robert Bindar <robertbindar@gmail.com>
  • Loading branch information
github-actions[bot] and robertbindar committed Aug 1, 2022
1 parent a517d83 commit 41dd22e
Show file tree
Hide file tree
Showing 9 changed files with 42 additions and 26 deletions.
16 changes: 14 additions & 2 deletions tiledb/common/filesystem/directory_entry.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,12 @@ class directory_entry {
*
* @param p The path of the entry
* @param size The size of the filesystem entry
* @param is_directory Value indicating if the entry is a dir or not
*/
directory_entry(const std::string& p, uintmax_t size)
directory_entry(const std::string& p, uintmax_t size, bool is_directory)
: path_(p)
, size_(size) {
, size_(size)
, is_directory_(is_directory) {
}

/** Destructor. */
Expand Down Expand Up @@ -115,6 +117,13 @@ class directory_entry {
return size_;
}

/**
* @return Checks whether the directory entry points to a directory
*/
bool is_directory() const {
return is_directory_;
}

private:
/* ********************************* */
/* PRIVATE ATTRIBUTES */
Expand All @@ -125,6 +134,9 @@ class directory_entry {

/** The size of a filesystem entry */
uintmax_t size_;

/** Stores whether the filesystem entry points to a directory */
bool is_directory_;
};

} // namespace tiledb::common::filesystem
Expand Down
6 changes: 4 additions & 2 deletions tiledb/sm/filesystem/azure.cc
Original file line number Diff line number Diff line change
Expand Up @@ -630,12 +630,14 @@ tuple<Status, optional<std::vector<directory_entry>>> Azure::ls_with_sizes(
entries.emplace_back(
"azure://" + container_name + "/" +
remove_front_slash(remove_trailing_slash(blob.name)),
0);
0,
blob.is_directory);
} else {
entries.emplace_back(
"azure://" + container_name + "/" +
remove_front_slash(remove_trailing_slash(blob.name)),
blob.content_length);
blob.content_length,
blob.is_directory);
}
}

Expand Down
6 changes: 4 additions & 2 deletions tiledb/sm/filesystem/gcs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -448,15 +448,17 @@ tuple<Status, optional<std::vector<directory_entry>>> GCS::ls_with_sizes(
entries.emplace_back(
gcs_prefix + bucket_name + "/" +
remove_front_slash(remove_trailing_slash(obj.name())),
obj.size());
obj.size(),
false);
} else if (absl::holds_alternative<std::string>(results)) {
// "Directories" are returned as strings here so we can't return
// any metadata for them.
entries.emplace_back(
gcs_prefix + bucket_name + "/" +
remove_front_slash(
remove_trailing_slash(absl::get<std::string>(results))),
0);
0,
true);
}
}

Expand Down
4 changes: 2 additions & 2 deletions tiledb/sm/filesystem/hdfs_filesystem.cc
Original file line number Diff line number Diff line change
Expand Up @@ -557,9 +557,9 @@ tuple<Status, optional<std::vector<directory_entry>>> HDFS::ls_with_sizes(
path = std::string("hdfs://") + path;
}
if (fileList[i].mKind == kObjectKindDirectory) {
entries.emplace_back(path, 0);
entries.emplace_back(path, 0, true);
} else {
entries.emplace_back(path, fileList[i].mSize);
entries.emplace_back(path, fileList[i].mSize, false);
}
}
libhdfs_->hdfsFreeFileInfo(fileList, numEntries);
Expand Down
4 changes: 2 additions & 2 deletions tiledb/sm/filesystem/mem_filesystem.cc
Original file line number Diff line number Diff line change
Expand Up @@ -291,11 +291,11 @@ class MemFilesystem::Directory : public MemFilesystem::FSNode {
for (const auto& child : children_) {
std::unique_lock<std::mutex> lock(child.second->mutex_);
if (child.second->is_dir()) {
names.emplace_back("mem://" + full_path + child.first, 0);
names.emplace_back("mem://" + full_path + child.first, 0, true);
} else {
uint64_t size;
RETURN_NOT_OK_TUPLE(child.second->get_size(&size), nullopt);
names.emplace_back("mem://" + full_path + child.first, size);
names.emplace_back("mem://" + full_path + child.first, size, false);
}
}

Expand Down
4 changes: 2 additions & 2 deletions tiledb/sm/filesystem/posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -313,11 +313,11 @@ tuple<Status, optional<std::vector<directory_entry>>> Posix::ls_with_sizes(
// If this penalty becomes noticeable, we should just duplicate
// this implementation in ls() and don't get the size
if (next_path->d_type == DT_DIR) {
entries.emplace_back(abspath, 0);
entries.emplace_back(abspath, 0, true);
} else {
uint64_t size;
RETURN_NOT_OK_TUPLE(file_size(abspath, &size), nullopt);
entries.emplace_back(abspath, size);
entries.emplace_back(abspath, size, false);
}
}
// close parent directory
Expand Down
7 changes: 5 additions & 2 deletions tiledb/sm/filesystem/s3.cc
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,8 @@ tuple<Status, optional<std::vector<directory_entry>>> S3::ls_with_sizes(
for (const auto& object : list_objects_outcome.GetResult().GetContents()) {
std::string file(object.GetKey().c_str());
uint64_t size = object.GetSize();
entries.emplace_back("s3://" + aws_auth + add_front_slash(file), size);
entries.emplace_back(
"s3://" + aws_auth + add_front_slash(file), size, false);
}

for (const auto& object :
Expand All @@ -742,7 +743,9 @@ tuple<Status, optional<std::vector<directory_entry>>> S3::ls_with_sizes(
// For "directories" it doesn't seem possible to get a shallow size in
// S3, so the size of such an entry will be 0 in S3.
entries.emplace_back(
"s3://" + aws_auth + add_front_slash(remove_trailing_slash(file)), 0);
"s3://" + aws_auth + add_front_slash(remove_trailing_slash(file)),
0,
true);
}

is_done =
Expand Down
17 changes: 7 additions & 10 deletions tiledb/sm/filesystem/vfs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -190,22 +190,19 @@ Status VFS::dir_size(const URI& dir_name, uint64_t* dir_size) const {

// Get all files in the tree rooted at `dir_name` and add their sizes
*dir_size = 0;
uint64_t size;
std::list<URI> to_ls;
bool is_file;
to_ls.push_front(dir_name);
do {
auto uri = to_ls.front();
to_ls.pop_front();
std::vector<URI> children;
RETURN_NOT_OK(ls(uri, &children));
for (const auto& child : children) {
RETURN_NOT_OK(this->is_file(child, &is_file));
if (is_file) {
RETURN_NOT_OK(file_size(child, &size));
*dir_size += size;
auto&& [st, children] = ls_with_sizes(uri);
RETURN_NOT_OK(st);

for (const auto& child : *children) {
if (!child.is_directory()) {
*dir_size += child.file_size();
} else {
to_ls.push_back(child);
to_ls.push_back(URI(child.path().native()));
}
}
} while (!to_ls.empty());
Expand Down
4 changes: 2 additions & 2 deletions tiledb/sm/filesystem/win.cc
Original file line number Diff line number Diff line change
Expand Up @@ -359,12 +359,12 @@ tuple<Status, optional<std::vector<directory_entry>>> Win::ls_with_sizes(
std::string file_path =
path + (ends_with_slash ? "" : "\\") + find_data.cFileName;
if (is_dir(file_path)) {
entries.emplace_back(file_path, 0);
entries.emplace_back(file_path, 0, true);
} else {
ULARGE_INTEGER size;
size.LowPart = find_data.nFileSizeLow;
size.HighPart = find_data.nFileSizeHigh;
entries.emplace_back(file_path, size.QuadPart);
entries.emplace_back(file_path, size.QuadPart, false);
}
}

Expand Down

0 comments on commit 41dd22e

Please sign in to comment.