Skip to content

Commit

Permalink
apacheGH-35903: [C++] Skeleton for Azure Blob Storage filesystem impl…
Browse files Browse the repository at this point in the history
…ementation (apache#35701)

### What changes are included in this PR?
This PR splits out the overall skeleton of apache#12914 in order to make merging of the overall Azure Filesystem easier to do.

### Are these changes tested?
Yes.

### Are there any user-facing changes?
Yes.

* Closes: apache#35903

Authored-by: Srinivas Lade <srinulade1@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
srilman authored and R-JunmingChen committed Aug 20, 2023
1 parent 3d5b8c4 commit 6f48dd0
Show file tree
Hide file tree
Showing 11 changed files with 382 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ jobs:
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 75
env:
ARROW_AZURE: ON
ARROW_BUILD_TESTS: ON
ARROW_DATASET: ON
ARROW_FLIGHT: ON
Expand Down
1 change: 1 addition & 0 deletions ci/docker/ubuntu-20.04-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin
# static Arrow to run Flight/Flight SQL tests
ENV absl_SOURCE=BUNDLED \
ARROW_ACERO=ON \
ARROW_AZURE=ON \
ARROW_BUILD_STATIC=ON \
ARROW_BUILD_TESTS=ON \
ARROW_DEPENDENCY_SOURCE=SYSTEM \
Expand Down
1 change: 1 addition & 0 deletions ci/docker/ubuntu-22.04-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin
# - libgtest-dev only provide sources
ENV absl_SOURCE=BUNDLED \
ARROW_ACERO=ON \
ARROW_AZURE=ON \
ARROW_BUILD_STATIC=ON \
ARROW_BUILD_TESTS=ON \
ARROW_DEPENDENCY_SOURCE=SYSTEM \
Expand Down
1 change: 1 addition & 0 deletions ci/scripts/cpp_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ pushd ${build_dir}
cmake \
-Dabsl_SOURCE=${absl_SOURCE:-} \
-DARROW_ACERO=${ARROW_ACERO:-ON} \
-DARROW_AZURE=${ARROW_AZURE:-OFF} \
-DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \
-DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \
-DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
"inherits": "features-basic",
"hidden": true,
"cacheVariables": {
"ARROW_AZURE": "ON",
"ARROW_GCS": "ON",
"ARROW_HDFS": "ON",
"ARROW_S3": "ON"
Expand Down
3 changes: 3 additions & 0 deletions cpp/cmake_modules/DefineOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,9 @@ takes precedence over ccache if a storage backend is configured" ON)
ARROW_COMPUTE
ARROW_IPC)

define_option(ARROW_AZURE
"Build Arrow with Azure support (requires the Azure SDK for C++)" OFF)

define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF)

define_option(ARROW_COMPUTE "Build all Arrow Compute kernels" OFF)
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,12 @@ if(ARROW_FILESYSTEM)
filesystem/path_util.cc
filesystem/util_internal.cc)

# Compile the Azure filesystem implementation only when ARROW_AZURE is enabled.
if(ARROW_AZURE)
list(APPEND ARROW_SRCS filesystem/azurefs.cc)
# Exclude azurefs.cc from precompiled headers and from unity builds.
set_source_files_properties(filesystem/azurefs.cc
PROPERTIES SKIP_PRECOMPILE_HEADERS ON
SKIP_UNITY_BUILD_INCLUSION ON)
endif()
if(ARROW_GCS)
list(APPEND ARROW_SRCS filesystem/gcsfs.cc filesystem/gcsfs_internal.cc)
set_source_files_properties(filesystem/gcsfs.cc filesystem/gcsfs_internal.cc
Expand Down
9 changes: 9 additions & 0 deletions cpp/src/arrow/filesystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ if(ARROW_GCS)
Boost::system)
endif()

# Register the azurefs_test unit test (labelled "filesystem") only when
# ARROW_AZURE is enabled; it links against Boost::filesystem and Boost::system.
if(ARROW_AZURE)
add_arrow_test(azurefs_test
EXTRA_LABELS
filesystem
EXTRA_LINK_LIBS
Boost::filesystem
Boost::system)
endif()

if(ARROW_S3)
add_arrow_test(s3fs_test
SOURCES
Expand Down
154 changes: 154 additions & 0 deletions cpp/src/arrow/filesystem/azurefs.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/filesystem/azurefs.h"

#include "arrow/result.h"
#include "arrow/util/checked_cast.h"

namespace arrow {
namespace fs {

// -----------------------------------------------------------------------
// AzureOptions Implementation

// Every member relies on its in-class initializer or its own default
// constructor, so the compiler-generated body is sufficient.
AzureOptions::AzureOptions() = default;

bool AzureOptions::Equals(const AzureOptions& other) const {
return (account_dfs_url == other.account_dfs_url &&
account_blob_url == other.account_blob_url &&
credentials_kind == other.credentials_kind);
}

// -----------------------------------------------------------------------
// AzureFileSystem Implementation

/// \brief Private state held by AzureFileSystem through its `impl_` pointer.
///
/// Stores the IO context and the options the filesystem was created with,
/// plus a flag recording whether the account uses a hierarchical namespace.
class AzureFileSystem::Impl {
 public:
  io::IOContext io_context_;
  // Initialized here because Init() only assigns the flag on the Azurite path;
  // without a default it would hold an indeterminate value for the Azure backend.
  bool is_hierarchical_namespace_enabled_ = false;
  AzureOptions options_;

  /// \param options filesystem options; taken by value and moved into place
  /// \param io_context IO context; taken by value and moved into place
  explicit Impl(AzureOptions options, io::IOContext io_context)
      : io_context_(std::move(io_context)), options_(std::move(options)) {}

  /// \brief Finish setting up the implementation after construction.
  /// \return Status::OK() (no failure path exists yet in this skeleton)
  Status Init() {
    if (options_.backend == AzureBackend::Azurite) {
      // gen1Client_->GetAccountInfo().Value.IsHierarchicalNamespaceEnabled
      // throws error in azurite, so the flag is set explicitly instead.
      is_hierarchical_namespace_enabled_ = false;
    }
    return Status::OK();
  }

  /// \return the options this implementation was constructed with
  const AzureOptions& options() const { return options_; }
};

// Forward to the implementation object, which owns the stored options.
const AzureOptions& AzureFileSystem::options() const {
  const AzureOptions& stored = impl_->options();
  return stored;
}

// Two filesystems are equal when they are the same object, or when both are
// Azure filesystems (same type name) whose options compare equal.
bool AzureFileSystem::Equals(const FileSystem& other) const {
  if (&other == this) {
    return true;
  }
  if (type_name() == other.type_name()) {
    // The type names match, so the downcast to AzureFileSystem is expected to be valid.
    const auto& rhs = ::arrow::internal::checked_cast<const AzureFileSystem&>(other);
    return options().Equals(rhs.options());
  }
  return false;
}

namespace {

// Builds the status shared by every entry point that the skeleton does not
// implement yet, so the message is spelled in exactly one place.
Status NotYetImplemented() {
  return Status::NotImplemented("The Azure FileSystem is not fully implemented");
}

}  // namespace

// All FileSystem operations below are placeholders: each one ignores its
// arguments and reports NotImplemented.

Result<FileInfo> AzureFileSystem::GetFileInfo(const std::string& path) {
  return NotYetImplemented();
}

Result<FileInfoVector> AzureFileSystem::GetFileInfo(const FileSelector& select) {
  return NotYetImplemented();
}

Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) {
  return NotYetImplemented();
}

Status AzureFileSystem::DeleteDir(const std::string& path) {
  return NotYetImplemented();
}

Status AzureFileSystem::DeleteDirContents(const std::string& path,
                                          bool missing_dir_ok) {
  return NotYetImplemented();
}

Status AzureFileSystem::DeleteRootDirContents() { return NotYetImplemented(); }

Status AzureFileSystem::DeleteFile(const std::string& path) {
  return NotYetImplemented();
}

Status AzureFileSystem::Move(const std::string& src, const std::string& dest) {
  return NotYetImplemented();
}

Status AzureFileSystem::CopyFile(const std::string& src, const std::string& dest) {
  return NotYetImplemented();
}

Result<std::shared_ptr<io::InputStream>> AzureFileSystem::OpenInputStream(
    const std::string& path) {
  return NotYetImplemented();
}

Result<std::shared_ptr<io::InputStream>> AzureFileSystem::OpenInputStream(
    const FileInfo& info) {
  return NotYetImplemented();
}

Result<std::shared_ptr<io::RandomAccessFile>> AzureFileSystem::OpenInputFile(
    const std::string& path) {
  return NotYetImplemented();
}

Result<std::shared_ptr<io::RandomAccessFile>> AzureFileSystem::OpenInputFile(
    const FileInfo& info) {
  return NotYetImplemented();
}

Result<std::shared_ptr<io::OutputStream>> AzureFileSystem::OpenOutputStream(
    const std::string& path,
    const std::shared_ptr<const KeyValueMetadata>& metadata) {
  return NotYetImplemented();
}

Result<std::shared_ptr<io::OutputStream>> AzureFileSystem::OpenAppendStream(
    const std::string&, const std::shared_ptr<const KeyValueMetadata>&) {
  return NotYetImplemented();
}

// Factory: constructs the filesystem, runs the implementation's Init() and
// returns the instance, or propagates the failing status from Init().
Result<std::shared_ptr<AzureFileSystem>> AzureFileSystem::Make(
    const AzureOptions& options, const io::IOContext& io_context) {
  // The constructor is private, so std::make_shared cannot be used here.
  auto filesystem =
      std::shared_ptr<AzureFileSystem>(new AzureFileSystem(options, io_context));
  RETURN_NOT_OK(filesystem->impl_->Init());
  return filesystem;
}

// Private constructor; instances are obtained through AzureFileSystem::Make().
// Hands the IO context to the FileSystem base and creates the implementation
// object from the same options and context.
AzureFileSystem::AzureFileSystem(const AzureOptions& options,
                                 const io::IOContext& io_context)
    : FileSystem(io_context),
      impl_(std::make_unique<Impl>(options, io_context)) {
  // NOTE(review): presumably tells the base class not to treat the default
  // async methods as plain synchronous calls; confirm against FileSystem.
  default_async_is_sync_ = false;
}

} // namespace fs
} // namespace arrow
159 changes: 159 additions & 0 deletions cpp/src/arrow/filesystem/azurefs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "arrow/filesystem/filesystem.h"
#include "arrow/util/macros.h"
#include "arrow/util/uri.h"

// Forward declarations of the Azure credential types referenced by
// AzureOptions below. Only std::shared_ptr members of these types are
// declared in this header, so the full definitions are not needed here.
namespace Azure {
namespace Core {
namespace Credentials {

// Held via shared_ptr in AzureOptions::service_principle_credentials_provider.
class TokenCredential;

} // namespace Credentials
} // namespace Core
namespace Storage {

// Held via shared_ptr in AzureOptions::storage_credentials_provider.
class StorageSharedKeyCredential;

} // namespace Storage
} // namespace Azure

namespace arrow {
namespace fs {

/// \brief The kind of credentials an AzureOptions instance is configured with.
///
/// Stored in AzureOptions::credentials_kind, which defaults to Anonymous.
enum class AzureCredentialsKind : int8_t {
/// Anonymous access (no credentials used), public
Anonymous,
/// Use explicitly-provided access key pair
StorageCredentials,
/// Use service principal credentials
/// (presumably AzureOptions::service_principle_credentials_provider; confirm)
ServicePrincipleCredentials,
/// Use a SAS token to authenticate
/// (presumably AzureOptions::sas_token; confirm)
Sas,
/// Use a connection string
/// (presumably AzureOptions::connection_string; confirm)
ConnectionString
};

/// \brief The storage backend an AzureFileSystem talks to.
///
/// Stored in AzureOptions::backend, which defaults to Azure.
enum class AzureBackend : bool {
/// Official Azure Remote Backend
Azure,
/// Local Simulated Storage
Azurite
};

/// Options for the AzureFileSystem implementation.
struct ARROW_EXPORT AzureOptions {
/// Account URL for the DFS endpoint
/// (presumably the Data Lake Storage endpoint; confirm).
std::string account_dfs_url;
/// Account URL for the Blob endpoint (presumably Blob Storage; confirm).
std::string account_blob_url;
/// Backend to target; defaults to the official Azure backend.
AzureBackend backend = AzureBackend::Azure;
/// Kind of credentials configured; defaults to anonymous access.
AzureCredentialsKind credentials_kind = AzureCredentialsKind::Anonymous;

/// SAS token (presumably used with AzureCredentialsKind::Sas; confirm).
std::string sas_token;
/// Connection string
/// (presumably used with AzureCredentialsKind::ConnectionString; confirm).
std::string connection_string;
/// Shared-key credential object
/// (presumably used with AzureCredentialsKind::StorageCredentials; confirm).
std::shared_ptr<Azure::Storage::StorageSharedKeyCredential>
storage_credentials_provider;
/// Token credential object (presumably used with
/// AzureCredentialsKind::ServicePrincipleCredentials; confirm).
std::shared_ptr<Azure::Core::Credentials::TokenCredential>
service_principle_credentials_provider;

/// Construct options with the defaults shown on the members above.
AzureOptions();

/// Return whether `other` is considered equivalent to these options.
bool Equals(const AzureOptions& other) const;
};

/// \brief Azure-backed FileSystem implementation for ABFS and ADLS.
///
/// ABFS (Azure Blob Storage - https://azure.microsoft.com/en-us/products/storage/blobs/)
/// is an object-based cloud storage system.
///
/// ADLS (Azure Data Lake Storage -
/// https://azure.microsoft.com/en-us/products/storage/data-lake-storage/)
/// is a scalable data storage system designed for big-data applications.
/// ADLS provides filesystem semantics, file-level security, and Hadoop
/// compatibility. Gen1 exists as a separate object that will be retired
/// on Feb 29, 2024. New ADLS accounts will use Gen2 instead, which is
/// implemented on top of ABFS.
///
/// TODO: GH-18014 Complete the internal implementation
/// and review the documentation
class ARROW_EXPORT AzureFileSystem : public FileSystem {
public:
~AzureFileSystem() override = default;

/// Return the type name of this filesystem, "abfs".
std::string type_name() const override { return "abfs"; }

/// Return the original Azure options when constructing the filesystem
const AzureOptions& options() const;

/// Return whether `other` is this same object, or a filesystem with the same
/// type name whose options compare equal.
bool Equals(const FileSystem& other) const override;

// NOTE: in the current skeleton implementation (azurefs.cc) every
// operation from here down to OpenAppendStream returns
// Status::NotImplemented.

Result<FileInfo> GetFileInfo(const std::string& path) override;

Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;

Status CreateDir(const std::string& path, bool recursive = true) override;

Status DeleteDir(const std::string& path) override;

Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;

Status DeleteRootDirContents() override;

Status DeleteFile(const std::string& path) override;

Status Move(const std::string& src, const std::string& dest) override;

Status CopyFile(const std::string& src, const std::string& dest) override;

Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;

Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;

Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;

Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const FileInfo& info) override;

Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;

Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;

/// Create an AzureFileSystem from the given options.
///
/// This factory is the public way to obtain an instance, since the
/// constructor is private. The IO context defaults to
/// io::default_io_context().
static Result<std::shared_ptr<AzureFileSystem>> Make(
const AzureOptions& options, const io::IOContext& = io::default_io_context());

private:
// Private so that instances are only created through Make().
explicit AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context);

// Implementation class, declared here and defined in azurefs.cc.
class Impl;
std::unique_ptr<Impl> impl_;
};

} // namespace fs
} // namespace arrow
Loading

0 comments on commit 6f48dd0

Please sign in to comment.