Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(dataset): external storage backend #3323

Merged
merged 7 commits into from Mar 20, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/test_deploy.yml
Expand Up @@ -710,7 +710,8 @@ jobs:
- name: Install system packages
run: |
sudo apt-get update -y
sudo apt-get install -y libyaml-0-2 libyaml-dev rclone fuse
sudo apt-get install -y libyaml-0-2 libyaml-dev unzip fuse
sudo -v ; curl https://rclone.org/install.sh | sudo bash
- uses: actions/cache@master
id: dependency-cache
with:
Expand Down Expand Up @@ -807,7 +808,8 @@ jobs:
- name: Install system packages
run: |
sudo apt-get update -y
sudo apt-get install -y libyaml-0-2 libyaml-dev rclone fuse
sudo apt-get install -y libyaml-0-2 libyaml-dev unzip fuse
sudo -v ; curl https://rclone.org/install.sh | sudo bash
- uses: actions/cache@master
id: dependency-cache
with:
Expand Down
5 changes: 2 additions & 3 deletions renku/__init__.py
@@ -1,6 +1,5 @@
#
# Copyright 2017-2023- Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
9 changes: 4 additions & 5 deletions renku/command/checks/__init__.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -20,10 +19,10 @@
from .datasets import (
check_dataset_files_outside_datadir,
check_dataset_old_metadata_location,
check_external_files,
check_invalid_datasets_derivation,
check_missing_files,
)
from .external import check_missing_external_files
from .githooks import check_git_hooks_installed
from .migration import check_migration
from .project import check_project_id_group
Expand All @@ -43,7 +42,7 @@
"check_lfs_info",
"check_migrated_activity_ids",
"check_migration",
"check_missing_external_files",
"check_external_files",
"check_missing_files",
"check_project_id_group",
"check_project_structure",
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/activities.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
57 changes: 53 additions & 4 deletions renku/command/checks/datasets.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -70,6 +69,9 @@ def check_missing_files(dataset_gateway: IDatasetGateway, **_):
missing = defaultdict(list)

for dataset in dataset_gateway.get_all_active_datasets():
# NOTE: Datasets with storage backend don't have local copies of files
if dataset.storage:
continue
for file_ in dataset.files:
path = project_context.path / file_.entity.path
file_exists = path.exists() or (file_.is_external and os.path.lexists(path))
Expand Down Expand Up @@ -163,7 +165,7 @@ def check_dataset_files_outside_datadir(fix, dataset_gateway: IDatasetGateway, *
detected_files = []

for file in dataset.files:
if file.is_external:
if file.is_external or file.linked:
continue
try:
get_safe_relative_path(project_context.path / file.entity.path, project_context.path / data_dir)
Expand Down Expand Up @@ -194,3 +196,50 @@ def check_dataset_files_outside_datadir(fix, dataset_gateway: IDatasetGateway, *
return False, problems

return True, None


@inject.autoparams("dataset_gateway")
def check_external_files(fix, dataset_gateway: IDatasetGateway, **_):
    """Report external files in active datasets and, when fixing, unlink them.

    Args:
        fix: Whether to fix found issues.
        dataset_gateway(IDatasetGateway): The injected dataset gateway.
        _: keyword arguments.

    Returns:
        Tuple of whether no external files are found and string of found problems.
    """
    from renku.core.dataset.dataset import file_unlink

    # Paths are collected for the user-facing listing; the files themselves are
    # grouped by dataset name so that each dataset is unlinked in a single call.
    paths = []
    files_by_dataset = defaultdict(list)

    for dataset in dataset_gateway.get_all_active_datasets():
        external = [dataset_file for dataset_file in dataset.files if dataset_file.is_external]
        if not external:
            continue
        paths.extend(dataset_file.entity.path for dataset_file in external)
        files_by_dataset[dataset.name].extend(external)

    # Nothing to report: the check passes.
    if not paths:
        return True, None

    listing = "\n\t".join(sorted(paths))

    if fix:
        communication.info(
            "The following external files were deleted from the project. You need to add them later manually using a "
            f"dataset with an external storage backend:\n\t{listing}"
        )

        for dataset_name, dataset_files in files_by_dataset.items():
            file_unlink(name=dataset_name, yes=True, dataset_files=dataset_files)

        return True, None

    # Not fixing: only warn and point the user at the ways to remove the files.
    problems = (
        f"\n{WARNING}: External files are deprecated in favor of an external dataset backend.\n"
        "Use 'renku dataset rm' or rerun 'renku doctor' with '--fix' flag to remove them:\n\t"
        f"{listing}\n"
    )
    return False, problems
53 changes: 0 additions & 53 deletions renku/command/checks/external.py

This file was deleted.

5 changes: 2 additions & 3 deletions renku/command/checks/githooks.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/migration.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/project.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/storage.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/validate_shacl.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/workflow.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/__init__.py
@@ -1,6 +1,5 @@
#
# Copyright 2017-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/command.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/communication.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/database.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/lock.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/migration.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/repo.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
24 changes: 12 additions & 12 deletions renku/command/dataset.py
Expand Up @@ -26,12 +26,12 @@
import_dataset,
list_dataset_files,
list_datasets,
mount_external_storage,
pull_external_data,
mount_cloud_storage,
pull_cloud_storage,
remove_dataset,
search_datasets,
show_dataset,
unmount_external_storage,
unmount_cloud_storage,
update_datasets,
)
from renku.core.dataset.dataset_add import add_to_dataset
Expand Down Expand Up @@ -127,18 +127,18 @@ def list_tags_command():
return Command().command(list_dataset_tags).with_database().require_migration()


def pull_external_data_command():
"""Command for pulling/copying data from an external storage."""
command = Command().command(pull_external_data).lock_dataset().with_database(write=True)
def pull_cloud_storage_command():
"""Command for pulling/copying data from a cloud storage."""
command = Command().command(pull_cloud_storage).lock_dataset().with_database(write=True)
return command.require_migration().with_commit(commit_only=DATASET_METADATA_PATHS + [CONFIG_LOCAL_PATH])


def mount_external_storage_command(unmount: bool):
"""Command for mounting an external storage."""
command = unmount_external_storage if unmount else mount_external_storage
def mount_cloud_storage_command(unmount: bool):
"""Command for mounting a cloud storage."""
command = unmount_cloud_storage if unmount else mount_cloud_storage
return Command().command(command).lock_dataset().with_database(write=False).require_migration()


def unmount_external_storage_command():
"""Command for unmounting an external storage."""
return Command().command(unmount_external_storage).lock_dataset().with_database(write=False).require_migration()
def unmount_cloud_storage_command():
"""Command for unmounting a cloud storage."""
return Command().command(unmount_cloud_storage).lock_dataset().with_database(write=False).require_migration()
5 changes: 2 additions & 3 deletions renku/command/format/__init__.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/format/activity.py
@@ -1,6 +1,5 @@
#
# Copyright 2021 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down