Skip to content

Commit

Permalink
feat(dataset): external storage backend (#3323)
Browse files Browse the repository at this point in the history
  • Loading branch information
m-alisafaee committed Mar 20, 2023
1 parent c8148d8 commit 2a461d4
Show file tree
Hide file tree
Showing 106 changed files with 1,295 additions and 1,149 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/test_deploy.yml
Expand Up @@ -710,7 +710,8 @@ jobs:
- name: Install system packages
run: |
sudo apt-get update -y
sudo apt-get install -y libyaml-0-2 libyaml-dev rclone fuse
sudo apt-get install -y libyaml-0-2 libyaml-dev unzip fuse
sudo -v ; curl https://rclone.org/install.sh | sudo bash
- uses: actions/cache@master
id: dependency-cache
with:
Expand Down Expand Up @@ -807,7 +808,8 @@ jobs:
- name: Install system packages
run: |
sudo apt-get update -y
sudo apt-get install -y libyaml-0-2 libyaml-dev rclone fuse
sudo apt-get install -y libyaml-0-2 libyaml-dev unzip fuse
sudo -v ; curl https://rclone.org/install.sh | sudo bash
- uses: actions/cache@master
id: dependency-cache
with:
Expand Down
5 changes: 2 additions & 3 deletions renku/__init__.py
@@ -1,6 +1,5 @@
#
# Copyright 2017-2023- Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
9 changes: 4 additions & 5 deletions renku/command/checks/__init__.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -20,10 +19,10 @@
from .datasets import (
check_dataset_files_outside_datadir,
check_dataset_old_metadata_location,
check_external_files,
check_invalid_datasets_derivation,
check_missing_files,
)
from .external import check_missing_external_files
from .githooks import check_git_hooks_installed
from .migration import check_migration
from .project import check_project_id_group
Expand All @@ -43,7 +42,7 @@
"check_lfs_info",
"check_migrated_activity_ids",
"check_migration",
"check_missing_external_files",
"check_external_files",
"check_missing_files",
"check_project_id_group",
"check_project_structure",
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/activities.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
57 changes: 53 additions & 4 deletions renku/command/checks/datasets.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -70,6 +69,9 @@ def check_missing_files(dataset_gateway: IDatasetGateway, **_):
missing = defaultdict(list)

for dataset in dataset_gateway.get_all_active_datasets():
# NOTE: Datasets with storage backend don't have local copies of files
if dataset.storage:
continue
for file_ in dataset.files:
path = project_context.path / file_.entity.path
file_exists = path.exists() or (file_.is_external and os.path.lexists(path))
Expand Down Expand Up @@ -163,7 +165,7 @@ def check_dataset_files_outside_datadir(fix, dataset_gateway: IDatasetGateway, *
detected_files = []

for file in dataset.files:
if file.is_external:
if file.is_external or file.linked:
continue
try:
get_safe_relative_path(project_context.path / file.entity.path, project_context.path / data_dir)
Expand Down Expand Up @@ -194,3 +196,50 @@ def check_dataset_files_outside_datadir(fix, dataset_gateway: IDatasetGateway, *
return False, problems

return True, None


@inject.autoparams("dataset_gateway")
def check_external_files(fix, dataset_gateway: IDatasetGateway, **_):
    """Look for external files in active datasets and optionally unlink them.

    Args:
        fix: Whether to fix found issues.
        dataset_gateway(IDatasetGateway): The injected dataset gateway.
        _: keyword arguments.

    Returns:
        ``(True, None)`` when no external file exists or when ``fix`` is set (after
        unlinking the files); otherwise ``(False, problems)`` where ``problems`` is
        a warning message listing the external files.
    """
    from renku.core.dataset.dataset import file_unlink

    paths = []
    files_by_dataset = defaultdict(list)

    # Collect every external file, both as a flat list of paths for reporting
    # and grouped by dataset name for the unlink step.
    for dataset in dataset_gateway.get_all_active_datasets():
        for dataset_file in dataset.files:
            if not dataset_file.is_external:
                continue
            paths.append(dataset_file.entity.path)
            files_by_dataset[dataset.name].append(dataset_file)

    if not paths:
        return True, None

    listing = "\n\t".join(sorted(paths))

    if fix:
        communication.info(
            "The following external files were deleted from the project. You need to add them later manually using a "
            f"dataset with an external storage backend:\n\t{listing}"
        )

        for dataset_name, dataset_files in files_by_dataset.items():
            file_unlink(name=dataset_name, yes=True, dataset_files=dataset_files)

        return True, None

    # Report only: tell the user how to get rid of the deprecated files.
    return False, (
        f"\n{WARNING}: External files are deprecated in favor of an external dataset backend.\n"
        "Use 'renku dataset rm' or rerun 'renku doctor' with '--fix' flag to remove them:\n\t"
        f"{listing}\n"
    )
53 changes: 0 additions & 53 deletions renku/command/checks/external.py

This file was deleted.

5 changes: 2 additions & 3 deletions renku/command/checks/githooks.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/migration.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/project.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/storage.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/validate_shacl.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/checks/workflow.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/__init__.py
@@ -1,6 +1,5 @@
#
# Copyright 2017-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/command.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/communication.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/database.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/lock.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/migration.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/command_builder/repo.py
@@ -1,6 +1,5 @@
#
# Copyright 2018-2023 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
24 changes: 12 additions & 12 deletions renku/command/dataset.py
Expand Up @@ -26,12 +26,12 @@
import_dataset,
list_dataset_files,
list_datasets,
mount_external_storage,
pull_external_data,
mount_cloud_storage,
pull_cloud_storage,
remove_dataset,
search_datasets,
show_dataset,
unmount_external_storage,
unmount_cloud_storage,
update_datasets,
)
from renku.core.dataset.dataset_add import add_to_dataset
Expand Down Expand Up @@ -127,18 +127,18 @@ def list_tags_command():
return Command().command(list_dataset_tags).with_database().require_migration()


def pull_external_data_command():
"""Command for pulling/copying data from an external storage."""
command = Command().command(pull_external_data).lock_dataset().with_database(write=True)
def pull_cloud_storage_command():
"""Command for pulling/copying data from a cloud storage."""
command = Command().command(pull_cloud_storage).lock_dataset().with_database(write=True)
return command.require_migration().with_commit(commit_only=DATASET_METADATA_PATHS + [CONFIG_LOCAL_PATH])


def mount_external_storage_command(unmount: bool):
"""Command for mounting an external storage."""
command = unmount_external_storage if unmount else mount_external_storage
def mount_cloud_storage_command(unmount: bool):
"""Command for mounting a cloud storage."""
command = unmount_cloud_storage if unmount else mount_cloud_storage
return Command().command(command).lock_dataset().with_database(write=False).require_migration()


def unmount_external_storage_command():
"""Command for unmounting an external storage."""
return Command().command(unmount_external_storage).lock_dataset().with_database(write=False).require_migration()
def unmount_cloud_storage_command():
"""Command for unmounting a cloud storage."""
return Command().command(unmount_cloud_storage).lock_dataset().with_database(write=False).require_migration()
5 changes: 2 additions & 3 deletions renku/command/format/__init__.py
@@ -1,6 +1,5 @@
#
# Copyright 2020 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
5 changes: 2 additions & 3 deletions renku/command/format/activity.py
@@ -1,6 +1,5 @@
#
# Copyright 2021 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down

0 comments on commit 2a461d4

Please sign in to comment.