From cf32efbc47e6ca3710c43f9b53ed7311e7b1af43 Mon Sep 17 00:00:00 2001
From: Mohammad Alisafaee
Date: Thu, 17 Oct 2019 15:55:19 +0200
Subject: [PATCH 1/6] feat: import data from other renku projects

---
 renku/core/commands/dataset.py      |  14 +++-
 renku/core/errors.py                |  10 ++-
 renku/core/management/datasets.py   | 117 +++++++++++++++++++---------
 tests/cli/test_datasets.py          |   4 +-
 tests/core/commands/test_dataset.py |   3 +-
 5 files changed, 107 insertions(+), 41 deletions(-)

diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py
index a7bf6709f2..aaf5ea1112 100644
--- a/renku/core/commands/dataset.py
+++ b/renku/core/commands/dataset.py
@@ -39,7 +39,7 @@
 from renku.core.commands.providers import ProviderFactory
 from renku.core.compat import contextlib
 from renku.core.errors import DatasetNotFound, InvalidAccessToken, \
-    MigrationRequired, ParameterError
+    MigrationRequired, ParameterError, UsageError
 from renku.core.management.datasets import DATASET_METADATA_PATHS
 from renku.core.management.git import COMMIT_DIFF_STRATEGY
 from renku.core.models.creators import Creator
@@ -166,6 +166,14 @@ def add_to_dataset(
     urlscontext=contextlib.nullcontext
 ):
     """Add data to a dataset."""
+    if sources or destination:
+        if len(urls) == 0:
+            raise UsageError('No URL is specified')
+        elif len(urls) > 1:
+            raise UsageError(
+                'Cannot add multiple URLs whit --source or --destination'
+            )
+
     # check for identifier before creating the dataset
     identifier = extract_doi(
         with_metadata.identifier
@@ -208,7 +216,9 @@ def add_to_dataset(
             'automatic dataset creation.'.format(name)
         )
     except (FileNotFoundError, git.exc.NoSuchPathError):
-        raise ParameterError('Could not process \n{0}'.format('\n'.join(urls)))
+        raise ParameterError(
+            'Could not find paths/URLs: \n{0}'.format('\n'.join(urls))
+        )
 
 
 @pass_local_client(clean=False, commit=False)
diff --git a/renku/core/errors.py b/renku/core/errors.py
index a4ef655ce9..c9912fdd7f 100644
--- a/renku/core/errors.py
+++ b/renku/core/errors.py
@@ -84,9 +84,11 @@ def __init__(self, message, param_hint=None):
         if param_hint:
             if isinstance(param_hint, (tuple, list)):
                 param_hint = ' / '.join('"{}"'.format(x) for x in param_hint)
-            message = 'Invalid value for {}: {}'.format(param_hint, message)
+            message = 'Invalid parameter value for {}: {}'.format(
+                param_hint, message
+            )
         else:
-            message = 'Invalid value: {}'.format(message)
+            message = 'Invalid parameter value: {}'.format(message)
 
         super().__init__(message)
 
@@ -367,3 +369,7 @@ class GitError(RenkuException):
 
 class UrlSchemaNotSupported(RenkuException):
     """Raised when adding data from unsupported URL schemas."""
+
+
+class OperationError(RenkuException):
+    """Raised when an operation at runtime raises an error."""
diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py
index 610d766f3a..ce137c7b89 100644
--- a/renku/core/management/datasets.py
+++ b/renku/core/management/datasets.py
@@ -81,7 +81,8 @@ def datasets_from_commit(self, commit=None):
     def datasets(self):
         """Return mapping from path to dataset."""
         result = {}
-        for path in self.renku_datasets_path.rglob(self.METADATA):
+        paths = (self.path / self.renku_datasets_path).rglob(self.METADATA)
+        for path in paths:
             result[path] = self.get_dataset(path)
         return result
 
@@ -240,8 +241,8 @@ def _add_from_url(self, dataset, dataset_path, url, link, destination):
         u = parse.urlparse(url)
 
         if u.scheme not in Dataset.SUPPORTED_SCHEMES:
-            raise NotImplementedError(
-                '{} URLs are not supported'.format(u.scheme)
+            raise errors.UrlSchemaNotSupported(
+                'Schema {} not supported'.format(u.scheme)
             )
 
         if destination:
@@ -284,7 +285,7 @@ def _add_from_url(self, dataset, dataset_path, url, link, destination):
             try:
                 os.link(str(src), str(dst))
             except Exception as e:
-                raise Exception(
+                raise errors.OperationError(
                     'Could not create hard link '
                     '- retry without --link.'
                 ) from e
@@ -298,7 +299,9 @@ def _add_from_url(self, dataset, dataset_path, url, link, destination):
                 response = requests.get(url)
                 dst.write_bytes(response.content)
             except error.HTTPError as e:  # pragma nocover
-                raise e
+                raise errors.OperationError(
+                    'Cannot download from {}'.format(url)
+                ) from e
 
         # make the added file read-only
         mode = dst.stat().st_mode & 0o777
@@ -339,8 +342,6 @@ def _add_from_git(
                 'parent': self
             }]
 
-            warnings.warn('Importing local git repository, use HTTPS')
-
             # determine where is the base repo path
             repo = Repo(u.path, search_parent_directories=True)
             repo_path = Path(repo.git_dir).parent.resolve()
@@ -357,7 +358,7 @@ def _add_from_git(
         elif u.scheme not in {'http', 'https', 'git+https', 'git+ssh'} and \
                 not url.startswith('git@'):
             raise errors.UrlSchemaNotSupported(
-                'Scheme {} not supported'.format(u.scheme)
+                'Schema {} not supported'.format(u.scheme)
             )
 
         repo, repo_path = self._prepare_git_repo(url, ref)
@@ -391,8 +392,9 @@ def _add_from_git(
 
         # Create metadata and move files to dataset
         results = []
-        client = LocalClient(repo_path)
+        remote_client = LocalClient(repo_path)
 
+        # Pull files from LFS
         paths = set()
         for path, src, _, __ in files:
             if src.is_dir():
@@ -402,15 +404,12 @@ def _add_from_git(
                 paths.add(path)
         self._fetch_lfs_files(repo_path, paths)
 
+        # Fetch metadata from Renku if any
+        paths = {f[0] for f in files}
+        metadata = self._fetch_files_metadata(remote_client, paths)
+
         for path, src, dst, _ in files:
             if not src.is_dir():
-                creators = []
-                # grab all the creators from the commit history
-                for commit in repo.iter_commits(paths=path):
-                    creator = Creator.from_commit(commit)
-                    if creator not in creators:
-                        creators.append(creator)
-
                 if is_local_repo:
                     dst_url = None
                 elif path:
@@ -418,9 +417,23 @@ def _add_from_git(
                 else:
                     dst_url = url
 
-                based_on = DatasetFile.from_revision(
-                    client, path=path, url=url
-                )
+                # Use original metadata if it exists
+                based_on = metadata.get(path)
+                if based_on:
+                    based_on.url = url
+                    based_on.based_on = None
+                    creators = based_on.creator
+                else:
+                    creators = []
+                    # grab all the creators from the commit history
+                    for commit in repo.iter_commits(paths=path):
+                        creator = Creator.from_commit(commit)
+                        if creator not in creators:
+                            creators.append(creator)
+
+                    based_on = DatasetFile.from_revision(
+                        remote_client, path=path, url=url
+                    )
 
                 path_in_dst_repo = dst.relative_to(self.path)
 
@@ -521,6 +534,17 @@ def _fetch_lfs_files(self, repo_path, paths):
         except SubprocessError:
             pass
 
+    @staticmethod
+    def _fetch_files_metadata(client, paths):
+        """Return metadata for files from paths."""
+        files = {}
+        for _, dataset in client.datasets.items():
+            for file_ in dataset.files:
+                path = file_.path
+                if path in paths:
+                    files[path] = file_
+        return files
+
     def get_relative_url(self, url):
         """Determine if the repo url should be relative."""
         # Check if the default remote of the branch we are on is on
@@ -627,7 +651,8 @@ def update_dataset_files(self, files, ref=None):
         deleted_files = []
 
         for file_ in files:
-            based_on = DatasetFile.from_jsonld(file_.based_on)
+            file_.based_on = DatasetFile.from_jsonld(file_.based_on)
+            based_on = file_.based_on
             url = based_on.url
             if url in visited_repos:
                 repo, repo_path, remote_client = visited_repos[url]
@@ -636,22 +661,28 @@ def update_dataset_files(self, files, ref=None):
                 remote_client = LocalClient(repo_path)
                 visited_repos[url] = repo, repo_path, remote_client
 
-            try:
-                remote_file = DatasetFile.from_revision(
-                    remote_client,
-                    path=based_on.path,
-                    url=url,
-                    added=based_on.added
-                )
-            except KeyError:
-                raise errors.InvalidFileOperation(
-                    'Cannot find file {} in the repo {}'.format(
-                        based_on.url, url
+            remote_file = self._fetch_file_metadata(
+                remote_client, based_on.path
+            )
+
+            if not remote_file:
+                try:
+                    remote_file = DatasetFile.from_revision(
+                        remote_client,
+                        path=based_on.path,
+                        url=url,
+                        added=based_on.added
+                    )
+                except KeyError:
+                    raise errors.InvalidFileOperation(
+                        'Cannot find file {} in the repo {}'.format(
+                            based_on.url, url
+                        )
                     )
-                )
 
-            commit_sha = based_on._id.split('/')[1]
-            if commit_sha != remote_file.commit.hexsha:
+            commit_sha = self._get_commit_sha_from_label(based_on)
+            remote_commit_sha = self._get_commit_sha_from_label(remote_file)
+            if commit_sha != remote_commit_sha:
                 src = Path(repo.working_dir) / based_on.path
                 dst = self.renku_path.parent / file_.path
 
@@ -659,7 +690,8 @@ def update_dataset_files(self, files, ref=None):
                 # Fetch file is it is tracked by Git LFS
                 self._fetch_lfs_files(repo_path, {based_on.path})
                 shutil.copy(str(src), str(dst))
-                file_.based_on = remote_file
+                file_.based_on.commit = remote_file.commit
+                file_.based_on._label = remote_file._label
                 updated_files.append(file_)
             else:
                 # File was removed or renamed
@@ -779,6 +811,21 @@ def checkout(repo, ref):
         else:
             return repo, repo_path
 
+    @staticmethod
+    def _fetch_file_metadata(client, path):
+        """Return metadata for a single file."""
+        for _, dataset in client.datasets.items():
+            for file_ in dataset.files:
+                if file_.path == path:
+                    return file_
+
+    @staticmethod
+    def _get_commit_sha_from_label(dataset_file):
+        label = dataset_file._label
+        if '@' in label:
+            return label.split('@')[1]
+        return label
+
 
 def check_for_git_repo(url):
     """Check if a url points to a git repository."""
diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py
index 0037ff7935..e74324d99c 100644
--- a/tests/cli/test_datasets.py
+++ b/tests/cli/test_datasets.py
@@ -1295,8 +1295,10 @@ def test_dataset_update(
     assert after['_label'] != before['_label']
     assert after['added'] == before['added']
     assert after['url'] == before['url']
-    assert after['based_on']['_id'] != before['based_on']['_id']
+    assert after['based_on']['_id'] == before['based_on']['_id']
+    assert after['based_on']['_label'] != before['based_on']['_label']
     assert after['based_on']['path'] == before['based_on']['path']
+    assert after['based_on']['based_on'] is None
 
 
 def test_dataset_update_remove_file(
diff --git a/tests/core/commands/test_dataset.py b/tests/core/commands/test_dataset.py
index 6095af6db9..e4cdd4a354 100644
--- a/tests/core/commands/test_dataset.py
+++ b/tests/core/commands/test_dataset.py
@@ -25,6 +25,7 @@
 import git
 import pytest
 
+from renku.core import errors
 from renku.core.models.creators import Creator
 from renku.core.models.datasets import Dataset, DatasetFile
 
@@ -59,7 +60,7 @@ def not_raises():
      ('', 'tempp', git.NoSuchPathError),
      ('http://', 'example.com/file', None),
      ('https://', 'example.com/file', None),
-     ('bla://', 'file', NotImplementedError)]
+     ('bla://', 'file', errors.UrlSchemaNotSupported)]
 )
 def test_data_add(
     scheme, path, error, client, data_file, directory_tree, dataset_responses

From 61672aec5cdc264029447785961a6c5847da6a26 Mon Sep 17 00:00:00 2001
From: Mohammad Alisafaee
Date: Tue, 22 Oct 2019 09:58:56 +0200
Subject: [PATCH 2/6] test: add more tests

---
 conftest.py                    |  27 +++++++
 renku/core/commands/dataset.py |   2 +-
 tests/cli/test_datasets.py     | 134 +++++++++++++++++++++++++++++++--
 3 files changed, 154 insertions(+), 9 deletions(-)

diff --git a/conftest.py b/conftest.py
index 6689e2dee8..9cc6eb630f 100644
--- a/conftest.py
+++ b/conftest.py
@@ -474,3 +474,30 @@ def sleep_after():
     import time
     yield
     time.sleep(0.5)
+
+
+@pytest.fixture
+def remote_project(data_repository, directory_tree):
+    """A second Renku project with a dataset."""
+    from renku.cli import cli
+
+    runner = CliRunner()
+
+    with runner.isolated_filesystem() as project_path:
+        runner.invoke(cli, ['-S', 'init'])
+        result = runner.invoke(
+            cli, ['-S', 'dataset', 'create', 'remote-dataset']
+        )
+        assert 0 == result.exit_code
+
+        result = runner.invoke(
+            cli,
+            [
+                '-S', 'dataset', 'add', '-s', 'file', '-s', 'dir2',
+                'remote-dataset', directory_tree.strpath
+            ],
+            catch_exceptions=False,
+        )
+        assert 0 == result.exit_code
+
+        yield runner, project_path
diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py
index aaf5ea1112..1b93c942c5 100644
--- a/renku/core/commands/dataset.py
+++ b/renku/core/commands/dataset.py
@@ -171,7 +171,7 @@ def add_to_dataset(
             raise UsageError('No URL is specified')
         elif len(urls) > 1:
             raise UsageError(
-                'Cannot add multiple URLs whit --source or --destination'
+                'Cannot add multiple URLs with --source or --destination'
             )
 
     # check for identifier before creating the dataset
diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py
index e74324d99c..4aaeaf0df4 100644
--- a/tests/cli/test_datasets.py
+++ b/tests/cli/test_datasets.py
@@ -431,16 +431,34 @@ def test_relative_git_import_to_dataset(tmpdir, runner, project, client):
     )
     assert 1 == result.exit_code
 
-    # copy a non-existing source
+
+@pytest.mark.parametrize(
+    'add_options,n_urls,message', [
+        (['-s', 'file', '-d', 'new-file'], 0, 'No URL is specified'),
+        (['-s', 'file'], 2, 'Cannot add multiple URLs'),
+        (['-d', 'file'], 2, 'Cannot add multiple URLs'),
+        (['-s', 'non-existing'], 1, 'No such file or directory'),
+    ]
+)
+def test_usage_error_in_import_from_git_repo(
+    data_repository, directory_tree, runner, project, client, add_options,
+    n_urls, message
+):
+    """Test user's errors when adding to a dataset from a git repository."""
+    # create a dataset
+    result = runner.invoke(cli, ['dataset', 'create', 'dataset'])
+    assert 0 == result.exit_code
+
+    urls = n_urls * [directory_tree.strpath]
+
+    # add data in subdirectory
     result = runner.invoke(
         cli,
-        [
-            'dataset', 'add', 'relative', '--source', 'non-existing',
-            str(tmpdir)
-        ],
-        catch_exceptions=True,
+        ['dataset', 'add', 'dataset'] + add_options + urls,
+        catch_exceptions=False,
     )
-    assert 2 == result.exit_code
+    assert result.exit_code == 2
+    assert message in result.output
 
 
 def test_dataset_add_with_link(tmpdir, runner, project, client):
@@ -1246,7 +1264,7 @@ def test_dataset_clean_up_when_add_fails(runner, client):
 
 
 def read_dataset_file_metadata(client, dataset_name, filename):
-
+    """Return metadata from dataset's YAML file."""
     path = client.dataset_path(dataset_name)
     assert path.exists()
 
@@ -1422,3 +1440,103 @@ def test_dataset_update_multiple_datasets(
     assert data == 'new content'
     data = open(str(client.path / 'data' / 'dataset-2' / 'file')).read()
     assert data == 'new content'
+
+
+def test_empty_update(client, runner, data_repository, directory_tree):
+    """Test update when nothing changed."""
+    # Add dataset to project
+    result = runner.invoke(
+        cli, ['dataset', 'add', 'dataset', directory_tree.strpath],
+        catch_exceptions=False
+    )
+    assert 0 == result.exit_code
+
+    commit_sha_before = client.repo.head.object.hexsha
+    result = runner.invoke(cli, ['dataset', 'update'], catch_exceptions=False)
+    assert 0 == result.exit_code
+    commit_sha_after = client.repo.head.object.hexsha
+    assert commit_sha_after == commit_sha_before
+
+
+def test_import_from_renku_project(remote_project, client, runner):
+    """Test an imported dataset from other renku repos will have metadata."""
+    from renku.core.management import LocalClient
+
+    _, remote_project_path = remote_project
+
+    remote_client = LocalClient(remote_project_path)
+    remote = read_dataset_file_metadata(
+        remote_client, 'remote-dataset', 'file'
+    )
+
+    runner.invoke(cli, ['-S', 'dataset', 'create', 'remote-dataset'])
+    result = runner.invoke(
+        cli,
+        [
+            '-S', 'dataset', 'add', '--src', 'data', 'remote-dataset',
+            str(remote_project_path)
+        ],
+        catch_exceptions=False,
+    )
+    assert 0 == result.exit_code
+
+    metadata = read_dataset_file_metadata(client, 'remote-dataset', 'file')
+    assert metadata['creator'] == remote['creator']
+    assert metadata['based_on']['_id'] == remote['_id']
+    assert metadata['based_on']['_label'] == remote['_label']
+    assert metadata['based_on']['path'] == remote['path']
+    assert metadata['based_on']['based_on'] is None
+    assert metadata['based_on']['url'] == remote_project_path
+
+
+def test_update_renku_project_dataset(
+    remote_project, client, runner, directory_tree, data_repository
+):
+    """Test an imported dataset from other renku repos will be updated."""
+    remote_runner, remote_project_path = remote_project
+
+    runner.invoke(cli, ['-S', 'dataset', 'create', 'remote-dataset'])
+    result = runner.invoke(
+        cli,
+        [
+            '-S', 'dataset', 'add', '--src', 'data/remote-dataset/file',
+            'remote-dataset',
+            str(remote_project_path)
+        ],
+        catch_exceptions=False,
+    )
+    assert 0 == result.exit_code
+
+    before = read_dataset_file_metadata(client, 'remote-dataset', 'file')
+
+    # Update and commit original file
+    file_ = directory_tree.join('file')
+    file_.write('new content')
+    data_repository.index.add([file_.strpath])
+    data_repository.index.commit('updated')
+
+    assert (Path(file_.strpath)).read_text() == 'new content'
+
+    # Update remote project
+    os.chdir(remote_project_path)
+    result = remote_runner.invoke(
+        cli, ['dataset', 'update'], catch_exceptions=False
+    )
+    assert 0 == result.exit_code
+
+    # Update project
+    os.chdir(client.path)
+    result = runner.invoke(cli, ['dataset', 'update'], catch_exceptions=False)
+    assert 0 == result.exit_code
+    content = (client.path / 'data' / 'remote-dataset' / 'file').read_text()
+    assert content == 'new content'
+
+    after = read_dataset_file_metadata(client, 'remote-dataset', 'file')
+    assert after['_id'] == before['_id']
+    assert after['_label'] != before['_label']
+    assert after['added'] == before['added']
+    assert after['url'] == before['url']
+    assert after['based_on']['_id'] == before['based_on']['_id']
+    assert after['based_on']['_label'] != before['based_on']['_label']
+    assert after['based_on']['path'] == before['based_on']['path']
+    assert after['based_on']['based_on'] is None

From 23adec1e1bdbf360f0fdb0f076d32668744f22c7 Mon Sep 17 00:00:00 2001
From: Mohammad Alisafaee
Date: Fri, 25 Oct 2019 10:05:16 +0200
Subject: [PATCH 3/6] doc: update command

---
 renku/cli/dataset.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/renku/cli/dataset.py b/renku/cli/dataset.py
index bb8030383d..2a525c2485 100644
--- a/renku/cli/dataset.py
+++ b/renku/cli/dataset.py
@@ -115,6 +115,26 @@
 branch, commit, or tag. The value passed to this option must be a valid
 reference in the remote Git repository.
 
+Updating a dataset:
+
+After adding files from a remote Git repository, you can check for updates in
+those files by using ``renku dataset update`` command. This command checks all
+remote files and copies over new content if there are any. It does not delete
+files from the local dataset if they are deleted from the remote Git
+repository; to force the delete use ``--delete`` argument.
+
+You can limit the scope of updated files by specifying dataset names, using
+``--include`` and ``--exclude`` to filter based on file names, or using
+``--creators`` to filter based on creators. For example, the following command
+updates only CSV files from ``my-dataset``:
+
+.. code-block:: console
+
+    $ renku dataset update -I '*.csv' my-dataset
+
+Note that putting glob patterns in quotation is needed to tell Unix shell not
+to expand them.
+
 Tagging a dataset:
 
 A dataset can be tagged with an arbitrary tag to refer to the dataset at that

From c0f6dff2cb82b7e17c2509f99b6efd1a0a1b17d4 Mon Sep 17 00:00:00 2001
From: Mohammad Alisafaee
Date: Fri, 25 Oct 2019 10:01:01 +0200
Subject: [PATCH 4/6] review: apply comments

---
 renku/cli/dataset.py              | 21 ++++++++++++----
 renku/core/commands/dataset.py    | 11 ++++++++-
 renku/core/errors.py              |  4 ++--
 renku/core/management/datasets.py | 40 ++++++++++++++++++-------------
 tests/cli/test_datasets.py        |  6 +++++
 5 files changed, 59 insertions(+), 23 deletions(-)

diff --git a/renku/cli/dataset.py b/renku/cli/dataset.py
index 2a525c2485..173bb45c0e 100644
--- a/renku/cli/dataset.py
+++ b/renku/cli/dataset.py
@@ -119,7 +119,7 @@
 After adding files from a remote Git repository, you can check for updates in
 those files by using ``renku dataset update`` command. This command checks all
-remote files and copies over new content if there are any. It does not delete
+remote files and copies over new content if there is any. It does not delete
 files from the local dataset if they are deleted from the remote Git
 repository; to force the delete use ``--delete`` argument.
 
@@ -132,7 +132,7 @@
 
     $ renku dataset update -I '*.csv' my-dataset
 
-Note that putting glob patterns in quotation is needed to tell Unix shell not
+Note that putting glob patterns in quotes is needed to tell Unix shell not
 to expand them.
 
 Tagging a dataset:
@@ -669,8 +669,21 @@ def _init(lock, id_queue):
 @click.option(
     '--ref', default=None, help='Update to a specific commit/tag/branch.'
 )
-def update(names, creators, include, exclude, ref):
+@click.option(
+    '--delete',
+    is_flag=True,
+    help='Delete local files that are deleted from remote.'
+)
+def update(names, creators, include, exclude, ref, delete):
     """Updates files in dataset from a remote Git repo."""
     progress_context = partial(progressbar, label='Updating files')
-    update_datasets(names, creators, include, exclude, ref, progress_context)
+    update_datasets(
+        names=names,
+        creators=creators,
+        include=include,
+        exclude=exclude,
+        ref=ref,
+        delete=delete,
+        progress_context=progress_context
+    )
     click.secho('OK', fg='green')
diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py
index 1b93c942c5..a645a7a55b 100644
--- a/renku/core/commands/dataset.py
+++ b/renku/core/commands/dataset.py
@@ -525,6 +525,7 @@ def update_datasets(
     include,
     exclude,
     ref,
+    delete,
     progress_context=contextlib.nullcontext
 ):
     """Update files from a remote Git repo."""
@@ -566,7 +567,15 @@ def update_datasets(
     with progress_context(
         possible_updates, item_show_func=lambda x: x.path if x else None
     ) as progressbar:
-        client.update_dataset_files(progressbar, ref)
+        deleted_files = client.update_dataset_files(
+            files=progressbar, ref=ref, delete=delete
+        )
+
+    if deleted_files and not delete:
+        click.echo(
+            'Some files are deleted from remote. To also delete them locally '
+            'run update command with `--delete` flag.'
+        )
 
 
 def _include_exclude(file_path, include=None, exclude=None):
diff --git a/renku/core/errors.py b/renku/core/errors.py
index c9912fdd7f..473ab9701c 100644
--- a/renku/core/errors.py
+++ b/renku/core/errors.py
@@ -367,8 +367,8 @@
 class GitError(RenkuException):
     """Raised when a remote Git repo cannot be accessed."""
 
 
-class UrlSchemaNotSupported(RenkuException):
-    """Raised when adding data from unsupported URL schemas."""
+class UrlSchemeNotSupported(RenkuException):
+    """Raised when adding data from unsupported URL schemes."""
diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py
index ce137c7b89..3db79aa192 100644
--- a/renku/core/management/datasets.py
+++ b/renku/core/management/datasets.py
@@ -241,8 +241,8 @@ def _add_from_url(self, dataset, dataset_path, url, link, destination):
         u = parse.urlparse(url)
 
         if u.scheme not in Dataset.SUPPORTED_SCHEMES:
-            raise errors.UrlSchemaNotSupported(
-                'Schema {} not supported'.format(u.scheme)
+            raise errors.UrlSchemeNotSupported(
+                'Scheme {} not supported'.format(u.scheme)
             )
 
         if destination:
@@ -357,8 +357,8 @@ def _add_from_git(
             url = repo_path.as_posix()
         elif u.scheme not in {'http', 'https', 'git+https', 'git+ssh'} and \
                 not url.startswith('git@'):
-            raise errors.UrlSchemaNotSupported(
-                'Schema {} not supported'.format(u.scheme)
+            raise errors.UrlSchemeNotSupported(
+                'Scheme {} not supported'.format(u.scheme)
            )
 
         repo, repo_path = self._prepare_git_repo(url, ref)
@@ -412,8 +412,6 @@ def _add_from_git(
             if not src.is_dir():
                 if is_local_repo:
                     dst_url = None
-                elif path:
-                    dst_url = '{}/{}'.format(url, path)
                 else:
                     dst_url = url
 
@@ -642,8 +640,14 @@ def remove_dataset_tags(self, dataset, tags):
 
         return dataset
 
-    def update_dataset_files(self, files, ref=None):
-        """Update files and dataset metadata according to their remotes."""
+    def update_dataset_files(self, files, ref, delete=False):
+        """Update files and dataset metadata according to their remotes.
+
+        :param files: List of files to be updated
+        :param delete: Indicates whether to delete files or not
+
+        :return: List of files that should be deleted
+        """
         from renku import LocalClient
 
         visited_repos = {}
@@ -695,12 +699,13 @@ def update_dataset_files(self, files, ref=None):
                 updated_files.append(file_)
             else:
                 # File was removed or renamed
-                os.remove(str(dst))
+                if delete:
+                    os.remove(str(dst))
                 deleted_files.append(file_)
 
-        if not updated_files and not deleted_files:
+        if not updated_files and (not delete or not deleted_files):
             # Nothing to commit or update
-            return
+            return deleted_files
 
         # Commit changes in files
@@ -735,13 +740,16 @@ def update_dataset_files(self, files, ref=None):
             file_.dataset.update_files([new_file])
             modified_datasets[file_.dataset.name] = file_.dataset
 
-        for file_ in deleted_files:
-            file_.dataset.unlink_file(file_.path)
-            modified_datasets[file_.dataset.name] = file_.dataset
+        if delete:
+            for file_ in deleted_files:
+                file_.dataset.unlink_file(file_.path)
+                modified_datasets[file_.dataset.name] = file_.dataset
 
         for dataset in modified_datasets.values():
             dataset.to_yaml()
 
+        return deleted_files
+
     def _prepare_git_repo(self, url, ref):
         def checkout(repo, ref):
             try:
@@ -804,9 +812,9 @@ def checkout(repo, ref):
                 ],
                 with_exceptions=False
             )
-        except GitCommandError:
+        except GitCommandError as e:
             raise errors.GitError(
-                'Cannot access remote Git repo: {}'.format(url)
+                'Cannot clone remote Git repo: {}\n\n{}'.format(url, e)
             )
         else:
             return repo, repo_path
diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py
index 4aaeaf0df4..73b9c6a21b 100644
--- a/tests/cli/test_datasets.py
+++ b/tests/cli/test_datasets.py
@@ -1341,6 +1341,12 @@ def test_dataset_update_remove_file(
     result = runner.invoke(cli, ['dataset', 'update'], catch_exceptions=False)
     assert 0 == result.exit_code
+    assert file_path.exists()
+
+    result = runner.invoke(
+        cli, ['dataset', 'update', '--delete'], catch_exceptions=False
+    )
+    assert 0 == result.exit_code
     assert not file_path.exists()

From b4f349f12e24c6b1e7c902423e01240685e7f6ed Mon Sep 17 00:00:00 2001
From: Mohammad Alisafaee
Date: Mon, 11 Nov 2019 13:38:20 +0100
Subject: [PATCH 5/6] rebase on master

---
 renku/core/commands/dataset.py      | 7 ++++++-
 renku/core/management/datasets.py   | 2 +-
 tests/cli/test_datasets.py          | 4 ++--
 tests/core/commands/test_cli.py     | 2 +-
 tests/core/commands/test_dataset.py | 2 +-
 5 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py
index a645a7a55b..d9adf860c6 100644
--- a/renku/core/commands/dataset.py
+++ b/renku/core/commands/dataset.py
@@ -517,7 +517,12 @@ def import_dataset(
     )
 
 
-@pass_local_client(clean=True, commit=True, commit_only=DATASET_METADATA_PATHS)
+@pass_local_client(
+    clean=True,
+    commit=True,
+    commit_only=DATASET_METADATA_PATHS,
+    commit_empty=False
+)
 def update_datasets(
     client,
     names,
diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py
index 3db79aa192..b7c5701c1f 100644
--- a/renku/core/management/datasets.py
+++ b/renku/core/management/datasets.py
@@ -347,7 +347,7 @@ def _add_from_git(
             repo_path = Path(repo.git_dir).parent.resolve()
 
             # if repo path is a parent of the url, treat the url as a source
-            if repo_path != Path(u.path):
+            if repo_path != Path(u.path).resolve():
                 if sources:
                     raise errors.UsageError(
                         'Cannot use --source within local repo subdirectories'
diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py
index 73b9c6a21b..c83cc22faa 100644
--- a/tests/cli/test_datasets.py
+++ b/tests/cli/test_datasets.py
@@ -1452,7 +1452,7 @@ def test_empty_update(client, runner, data_repository, directory_tree):
     """Test update when nothing changed."""
     # Add dataset to project
     result = runner.invoke(
-        cli, ['dataset', 'add', 'dataset', directory_tree.strpath],
+        cli, ['dataset', 'add', '--create', 'dataset', directory_tree.strpath],
         catch_exceptions=False
     )
     assert 0 == result.exit_code
@@ -1531,7 +1531,7 @@ def test_update_renku_project_dataset(
     assert 0 == result.exit_code
 
     # Update project
-    os.chdir(client.path)
+    os.chdir(str(client.path))
     result = runner.invoke(cli, ['dataset', 'update'], catch_exceptions=False)
     assert 0 == result.exit_code
     content = (client.path / 'data' / 'remote-dataset' / 'file').read_text()
diff --git a/tests/core/commands/test_cli.py b/tests/core/commands/test_cli.py
index bffc9325ca..da94af86b3 100644
--- a/tests/core/commands/test_cli.py
+++ b/tests/core/commands/test_cli.py
@@ -787,7 +787,7 @@ def test_config_manager_set_value(client, global_config_dir, global_only):
     assert 'zenodo' not in config.sections()
 
 
-def test_config_get_value(client):
+def test_config_get_value(client, global_config_dir):
     """Check reading from configuration."""
     # Value set locally is not visible globally
     client.set_value('local', 'key', 'local-value')
diff --git a/tests/core/commands/test_dataset.py b/tests/core/commands/test_dataset.py
index e4cdd4a354..1b29726bcf 100644
--- a/tests/core/commands/test_dataset.py
+++ b/tests/core/commands/test_dataset.py
@@ -60,7 +60,7 @@ def not_raises():
      ('', 'tempp', git.NoSuchPathError),
      ('http://', 'example.com/file', None),
      ('https://', 'example.com/file', None),
-     ('bla://', 'file', errors.UrlSchemaNotSupported)]
+     ('bla://', 'file', errors.UrlSchemeNotSupported)]
 )
 def test_data_add(
     scheme, path, error, client, data_file, directory_tree, dataset_responses

From 867ac1ae089d8cc0c09687b0788e20aa3511227e Mon Sep 17 00:00:00 2001
From: Mohammad Alisafaee
Date: Tue, 12 Nov 2019 10:18:25 +0100
Subject: [PATCH 6/6] feat: print full traceback when re-raising

---
 renku/cli/dataset.py              | 3 ++-
 renku/cli/exception_handler.py    | 2 ++
 renku/core/commands/dataset.py    | 4 ++--
 renku/core/management/datasets.py | 4 ++--
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/renku/cli/dataset.py b/renku/cli/dataset.py
index 173bb45c0e..ab33eb5634 100644
--- a/renku/cli/dataset.py
+++ b/renku/cli/dataset.py
@@ -121,7 +121,8 @@
 those files by using ``renku dataset update`` command. This command checks all
 remote files and copies over new content if there is any. It does not delete
 files from the local dataset if they are deleted from the remote Git
-repository; to force the delete use ``--delete`` argument.
+repository; to force the delete use ``--delete`` argument. You can update to a
+specific branch, commit, or tag by passing ``--ref`` option.
 
 You can limit the scope of updated files by specifying dataset names, using
 ``--include`` and ``--exclude`` to filter based on file names, or using
diff --git a/renku/cli/exception_handler.py b/renku/cli/exception_handler.py
index 8cc2794e3f..0dcb53787b 100644
--- a/renku/cli/exception_handler.py
+++ b/renku/cli/exception_handler.py
@@ -90,6 +90,8 @@ def main(self, *args, **kwargs):
             return super().main(*args, **kwargs)
         except RenkuException as e:
             click.echo('Error: {}'.format(e))
+            if e.__cause__ is not None:
+                click.echo('\n{}'.format(traceback.format_exc()))
             exit_code = 1
             if isinstance(e, (ParameterError, UsageError)):
                 exit_code = 2
diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py
index d9adf860c6..9fa2fb35ab 100644
--- a/renku/core/commands/dataset.py
+++ b/renku/core/commands/dataset.py
@@ -215,10 +215,10 @@ def add_to_dataset(
             '"renku dataset add {0}" command with "--create" option for '
             'automatic dataset creation.'.format(name)
         )
-    except (FileNotFoundError, git.exc.NoSuchPathError):
+    except (FileNotFoundError, git.exc.NoSuchPathError) as e:
         raise ParameterError(
             'Could not find paths/URLs: \n{0}'.format('\n'.join(urls))
-        )
+        ) from e
diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py
index b7c5701c1f..c8db4a51f6 100644
--- a/renku/core/management/datasets.py
+++ b/renku/core/management/datasets.py
@@ -814,8 +814,8 @@ def checkout(repo, ref):
             )
         except GitCommandError as e:
             raise errors.GitError(
-                'Cannot clone remote Git repo: {}\n\n{}'.format(url, e)
-            )
+                'Cannot clone remote Git repo: {}'.format(url)
+            ) from e
         else:
             return repo, repo_path
 
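
Taken together, the series enables the following end-to-end flow. This is a sketch for orientation; the dataset name and the path to the second Renku project are illustrative and do not come from the patches:

.. code-block:: console

    $ renku dataset create my-dataset
    $ renku dataset add -s file my-dataset /path/to/other-renku-project
    $ renku dataset update my-dataset
    $ renku dataset update --delete my-dataset

Note that patch 1 restricts ``-s``/``--source`` and ``-d``/``--destination`` to exactly one URL, which is why the ``add`` invocation above names a single project path.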
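The update check itself reduces to comparing the commit SHA embedded in a file's ``_label`` against the SHA of the corresponding file in the source repository. A minimal standalone sketch of the ``_get_commit_sha_from_label`` logic added in patch 1, assuming labels of the form ``<path>@<sha>``; the label values below are hypothetical, for illustration only:

.. code-block:: python

    def get_commit_sha_from_label(label):
        """Return the SHA part of a '<path>@<sha>' label, or the label itself."""
        if '@' in label:
            return label.split('@')[1]
        return label

    # The file changed upstream: the SHA recorded locally no longer matches
    # the SHA of the same file in the source repository.
    local_label = 'data/my-dataset/file@abc1234'    # hypothetical label
    remote_label = 'data/my-dataset/file@def5678'   # hypothetical label
    assert (get_commit_sha_from_label(local_label) !=
            get_commit_sha_from_label(remote_label))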