diff --git a/conftest.py b/conftest.py
index 6689e2dee8..9cc6eb630f 100644
--- a/conftest.py
+++ b/conftest.py
@@ -474,3 +474,30 @@ def sleep_after():
     import time
     yield
     time.sleep(0.5)
+
+
+@pytest.fixture
+def remote_project(data_repository, directory_tree):
+    """A second Renku project with a dataset."""
+    from renku.cli import cli
+
+    runner = CliRunner()
+
+    with runner.isolated_filesystem() as project_path:
+        runner.invoke(cli, ['-S', 'init'])
+        result = runner.invoke(
+            cli, ['-S', 'dataset', 'create', 'remote-dataset']
+        )
+        assert 0 == result.exit_code
+
+        result = runner.invoke(
+            cli,
+            [
+                '-S', 'dataset', 'add', '-s', 'file', '-s', 'dir2',
+                'remote-dataset', directory_tree.strpath
+            ],
+            catch_exceptions=False,
+        )
+        assert 0 == result.exit_code
+
+        yield runner, project_path
diff --git a/renku/cli/dataset.py b/renku/cli/dataset.py
index bb8030383d..ab33eb5634 100644
--- a/renku/cli/dataset.py
+++ b/renku/cli/dataset.py
@@ -115,6 +115,35 @@
 branch, commit, or tag. The value passed to this option must be a valid
 reference in the remote Git repository.
 
+Updating a dataset:
+
+After adding files from a remote Git repository, you can check for updates in
+those files by using the ``renku dataset update`` command. This command checks
+all remote files and copies over new content if there is any. It does not
+delete files from the local dataset if they are deleted from the remote Git
+repository; to force deletion, use the ``--delete`` argument. You can update
+to a specific branch, commit, or tag by passing the ``--ref`` option.
+
+You can limit the scope of updated files by specifying dataset names, using
+``--include`` and ``--exclude`` to filter on file names, or using
+``--creators`` to filter on creators. For example, the following command
+updates only CSV files from ``my-dataset``:
+
+.. code-block:: console
+
+    $ renku dataset update -I '*.csv' my-dataset
+
+Note that glob patterns must be quoted to prevent the Unix shell from
+expanding them.
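+
+For example, assuming the remote repository has a tag ``v0.5.0``, the
+following would update ``my-dataset`` to that tag and also delete local
+files that were removed on the remote:
+
+.. code-block:: console
+
+    $ renku dataset update --ref v0.5.0 --delete my-dataset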
+
 Tagging a dataset:
 
 A dataset can be tagged with an arbitrary tag to refer to the dataset at that
@@ -649,8 +678,21 @@ def _init(lock, id_queue):
 @click.option(
     '--ref', default=None, help='Update to a specific commit/tag/branch.'
 )
-def update(names, creators, include, exclude, ref):
+@click.option(
+    '--delete',
+    is_flag=True,
+    help='Delete local files that were deleted from the remote.'
+)
+def update(names, creators, include, exclude, ref, delete):
     """Updates files in dataset from a remote Git repo."""
     progress_context = partial(progressbar, label='Updating files')
-    update_datasets(names, creators, include, exclude, ref, progress_context)
+    update_datasets(
+        names=names,
+        creators=creators,
+        include=include,
+        exclude=exclude,
+        ref=ref,
+        delete=delete,
+        progress_context=progress_context
+    )
     click.secho('OK', fg='green')
diff --git a/renku/cli/exception_handler.py b/renku/cli/exception_handler.py
index 8cc2794e3f..0dcb53787b 100644
--- a/renku/cli/exception_handler.py
+++ b/renku/cli/exception_handler.py
@@ -90,6 +90,8 @@ def main(self, *args, **kwargs):
             return super().main(*args, **kwargs)
         except RenkuException as e:
             click.echo('Error: {}'.format(e))
+            if e.__cause__ is not None:
+                click.echo('\n{}'.format(traceback.format_exc()))
             exit_code = 1
             if isinstance(e, (ParameterError, UsageError)):
                 exit_code = 2
diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py
index a7bf6709f2..9fa2fb35ab 100644
--- a/renku/core/commands/dataset.py
+++ b/renku/core/commands/dataset.py
@@ -39,7 +39,7 @@
 from renku.core.commands.providers import ProviderFactory
 from renku.core.compat import contextlib
 from renku.core.errors import DatasetNotFound, InvalidAccessToken, \
-    MigrationRequired, ParameterError
+    MigrationRequired, ParameterError, UsageError
 from renku.core.management.datasets import DATASET_METADATA_PATHS
 from renku.core.management.git import COMMIT_DIFF_STRATEGY
 from renku.core.models.creators import Creator
@@ -166,6 +166,14 @@ def add_to_dataset(
     urlscontext=contextlib.nullcontext
 ):
     """Add data to a dataset."""
+    if sources or destination:
+        if len(urls) == 0:
+            raise UsageError('No URL is specified')
+        elif len(urls) > 1:
+            raise UsageError(
+                'Cannot add multiple URLs with --source or --destination'
+            )
+
     # check for identifier before creating the dataset
     identifier = extract_doi(
         with_metadata.identifier
@@ -207,8 +215,10 @@
             '"renku dataset add {0}" command with "--create" option for '
             'automatic dataset creation.'.format(name)
         )
-    except (FileNotFoundError, git.exc.NoSuchPathError):
-        raise ParameterError('Could not process \n{0}'.format('\n'.join(urls)))
+    except (FileNotFoundError, git.exc.NoSuchPathError) as e:
+        raise ParameterError(
+            'Could not find paths/URLs: \n{0}'.format('\n'.join(urls))
+        ) from e
 
 
 @pass_local_client(clean=False, commit=False)
@@ -507,7 +517,12 @@ def import_dataset(
     )
 
 
-@pass_local_client(clean=True, commit=True, commit_only=DATASET_METADATA_PATHS)
+@pass_local_client(
+    clean=True,
+    commit=True,
+    commit_only=DATASET_METADATA_PATHS,
+    commit_empty=False
+)
 def update_datasets(
     client,
     names,
@@ -515,6 +530,7 @@ def update_datasets(
     include,
     exclude,
     ref,
+    delete,
     progress_context=contextlib.nullcontext
 ):
     """Update files from a remote Git repo."""
@@ -556,7 +572,15 @@
     with progress_context(
         possible_updates, item_show_func=lambda x: x.path if x else None
     ) as progressbar:
-        client.update_dataset_files(progressbar, ref)
+        deleted_files = client.update_dataset_files(
+            files=progressbar, ref=ref, delete=delete
+        )
+
+        if deleted_files and not delete:
+            click.echo(
+                'Some files were deleted from the remote. To also delete them '
+                'locally, rerun the update command with the `--delete` flag.'
+            )
 
 
 def _include_exclude(file_path, include=None, exclude=None):
diff --git a/renku/core/errors.py b/renku/core/errors.py
index a4ef655ce9..473ab9701c 100644
--- a/renku/core/errors.py
+++ b/renku/core/errors.py
@@ -84,9 +84,11 @@ def __init__(self, message, param_hint=None):
         if param_hint:
             if isinstance(param_hint, (tuple, list)):
                 param_hint = ' / '.join('"{}"'.format(x) for x in param_hint)
-            message = 'Invalid value for {}: {}'.format(param_hint, message)
+            message = 'Invalid parameter value for {}: {}'.format(
+                param_hint, message
+            )
         else:
-            message = 'Invalid value: {}'.format(message)
+            message = 'Invalid parameter value: {}'.format(message)
 
         super().__init__(message)
 
@@ -365,5 +367,9 @@ class GitError(RenkuException):
     """Raised when a remote Git repo cannot be accessed."""
 
 
-class UrlSchemaNotSupported(RenkuException):
-    """Raised when adding data from unsupported URL schemas."""
+class UrlSchemeNotSupported(RenkuException):
+    """Raised when adding data from unsupported URL schemes."""
+
+
+class OperationError(RenkuException):
+    """Raised when an operation fails at runtime."""
diff --git a/renku/core/management/datasets.py b/renku/core/management/datasets.py
index 610d766f3a..c8db4a51f6 100644
--- a/renku/core/management/datasets.py
+++ b/renku/core/management/datasets.py
@@ -81,7 +81,8 @@ def datasets_from_commit(self, commit=None):
     def datasets(self):
         """Return mapping from path to dataset."""
         result = {}
-        for path in self.renku_datasets_path.rglob(self.METADATA):
+        paths = (self.path / self.renku_datasets_path).rglob(self.METADATA)
+        for path in paths:
             result[path] = self.get_dataset(path)
         return result
 
@@ -240,8 +241,8 @@ def _add_from_url(self, dataset, dataset_path, url, link, destination):
         u = parse.urlparse(url)
 
         if u.scheme not in Dataset.SUPPORTED_SCHEMES:
-            raise NotImplementedError(
-                '{} URLs are not supported'.format(u.scheme)
+            raise errors.UrlSchemeNotSupported(
+                'Scheme {} not supported'.format(u.scheme)
             )
 
         if destination:
@@ -284,7 +285,7 @@
                 try:
                     os.link(str(src), str(dst))
                 except Exception as e:
-                    raise Exception(
+                    raise errors.OperationError(
                         'Could not create hard link '
                         '- retry without --link.'
                     ) from e
@@ -298,7 +299,9 @@
                 response = requests.get(url)
                 dst.write_bytes(response.content)
             except error.HTTPError as e:  # pragma nocover
-                raise e
+                raise errors.OperationError(
+                    'Cannot download from {}'.format(url)
+                ) from e
 
         # make the added file read-only
         mode = dst.stat().st_mode & 0o777
@@ -339,14 +342,12 @@
                 'parent': self
             }]
-        warnings.warn('Importing local git repository, use HTTPS')
-
         # determine where is the base repo path
         repo = Repo(u.path, search_parent_directories=True)
         repo_path = Path(repo.git_dir).parent.resolve()
 
         # if repo path is a parent of the url, treat the url as a source
-        if repo_path != Path(u.path):
+        if repo_path != Path(u.path).resolve():
             if sources:
                 raise errors.UsageError(
                     'Cannot use --source within local repo subdirectories'
                 )
@@ -356,7 +357,7 @@
             url = repo_path.as_posix()
         elif u.scheme not in {'http', 'https', 'git+https', 'git+ssh'} and \
                 not url.startswith('git@'):
-            raise errors.UrlSchemaNotSupported(
+            raise errors.UrlSchemeNotSupported(
                 'Scheme {} not supported'.format(u.scheme)
             )
 
@@ -391,8 +392,9 @@
 
         # Create metadata and move files to dataset
         results = []
-        client = LocalClient(repo_path)
+        remote_client = LocalClient(repo_path)
 
+        # Pull files from LFS
         paths = set()
         for path, src, _, __ in files:
             if src.is_dir():
@@ -402,25 +404,34 @@
             paths.add(path)
         self._fetch_lfs_files(repo_path, paths)
 
+        # Fetch metadata from Renku if any
+        paths = {f[0] for f in files}
+        metadata = self._fetch_files_metadata(remote_client, paths)
+
         for path, src, dst, _ in files:
             if not src.is_dir():
-                creators = []
-                # grab all the creators from the commit history
-                for commit in repo.iter_commits(paths=path):
-                    creator = Creator.from_commit(commit)
-                    if creator not in creators:
-                        creators.append(creator)
-
                 if is_local_repo:
                     dst_url = None
-                elif path:
-                    dst_url = '{}/{}'.format(url, path)
                 else:
                     dst_url = url
 
-                based_on = DatasetFile.from_revision(
-                    client, path=path, url=url
-                )
+                # Use original metadata if it exists
+                based_on = metadata.get(path)
+                if based_on:
+                    based_on.url = url
+                    based_on.based_on = None
+                    creators = based_on.creator
+                else:
+                    creators = []
+                    # grab all the creators from the commit history
+                    for commit in repo.iter_commits(paths=path):
+                        creator = Creator.from_commit(commit)
+                        if creator not in creators:
+                            creators.append(creator)
+
+                    based_on = DatasetFile.from_revision(
+                        remote_client, path=path, url=url
+                    )
 
                 path_in_dst_repo = dst.relative_to(self.path)
 
@@ -521,6 +532,17 @@
             except SubprocessError:
                 pass
 
+    @staticmethod
+    def _fetch_files_metadata(client, paths):
+        """Return metadata for files from paths."""
+        files = {}
+        for _, dataset in client.datasets.items():
+            for file_ in dataset.files:
+                path = file_.path
+                if path in paths:
+                    files[path] = file_
+        return files
+
     def get_relative_url(self, url):
         """Determine if the repo url should be relative."""
         # Check if the default remote of the branch we are on is on
@@ -618,8 +640,14 @@
 
         return dataset
 
-    def update_dataset_files(self, files, ref=None):
-        """Update files and dataset metadata according to their remotes."""
+    def update_dataset_files(self, files, ref, delete=False):
+        """Update files and dataset metadata according to their remotes.
+
+        :param files: List of files to be updated
+        :param delete: Whether to delete local files removed on the remote
+
+        :return: List of files that should be deleted
+        """
         from renku import LocalClient
 
         visited_repos = {}
@@ -627,7 +655,8 @@
         deleted_files = []
 
         for file_ in files:
-            based_on = DatasetFile.from_jsonld(file_.based_on)
+            file_.based_on = DatasetFile.from_jsonld(file_.based_on)
+            based_on = file_.based_on
             url = based_on.url
             if url in visited_repos:
                 repo, repo_path, remote_client = visited_repos[url]
@@ -636,22 +665,28 @@
                 remote_client = LocalClient(repo_path)
                 visited_repos[url] = repo, repo_path, remote_client
 
-            try:
-                remote_file = DatasetFile.from_revision(
-                    remote_client,
-                    path=based_on.path,
-                    url=url,
-                    added=based_on.added
-                )
-            except KeyError:
-                raise errors.InvalidFileOperation(
-                    'Cannot find file {} in the repo {}'.format(
-                        based_on.url, url
-                    )
-                )
+            remote_file = self._fetch_file_metadata(
+                remote_client, based_on.path
+            )
+
+            if not remote_file:
+                try:
+                    remote_file = DatasetFile.from_revision(
+                        remote_client,
+                        path=based_on.path,
+                        url=url,
+                        added=based_on.added
+                    )
+                except KeyError:
+                    raise errors.InvalidFileOperation(
+                        'Cannot find file {} in the repo {}'.format(
+                            based_on.url, url
+                        )
+                    )
 
-            commit_sha = based_on._id.split('/')[1]
-            if commit_sha != remote_file.commit.hexsha:
+            commit_sha = self._get_commit_sha_from_label(based_on)
+            remote_commit_sha = self._get_commit_sha_from_label(remote_file)
+            if commit_sha != remote_commit_sha:
                 src = Path(repo.working_dir) / based_on.path
                 dst = self.renku_path.parent / file_.path
@@ -659,16 +694,18 @@
                     # Fetch file if it is tracked by Git LFS
                     self._fetch_lfs_files(repo_path, {based_on.path})
                     shutil.copy(str(src), str(dst))
-                file_.based_on = remote_file
+                file_.based_on.commit = remote_file.commit
+                file_.based_on._label = remote_file._label
                 updated_files.append(file_)
             else:
                 # File was removed or renamed
-                os.remove(str(dst))
+                if delete:
+                    os.remove(str(dst))
                 deleted_files.append(file_)
 
-        if not updated_files and not deleted_files:
+        if not updated_files and (not delete or not deleted_files):
             # Nothing to commit or update
-            return
+            return deleted_files
 
         # Commit changes in files
@@ -703,13 +740,16 @@
                 file_.dataset.update_files([new_file])
                 modified_datasets[file_.dataset.name] = file_.dataset
 
-        for file_ in deleted_files:
-            file_.dataset.unlink_file(file_.path)
-            modified_datasets[file_.dataset.name] = file_.dataset
+        if delete:
+            for file_ in deleted_files:
+                file_.dataset.unlink_file(file_.path)
+                modified_datasets[file_.dataset.name] = file_.dataset
 
         for dataset in modified_datasets.values():
             dataset.to_yaml()
 
+        return deleted_files
+
     def _prepare_git_repo(self, url, ref):
         def checkout(repo, ref):
             try:
@@ -772,13 +812,29 @@
                 ],
                 with_exceptions=False
             )
-        except GitCommandError:
+        except GitCommandError as e:
             raise errors.GitError(
-                'Cannot access remote Git repo: {}'.format(url)
-            )
+                'Cannot clone remote Git repo: {}'.format(url)
+            ) from e
         else:
             return repo, repo_path
 
+    @staticmethod
+    def _fetch_file_metadata(client, path):
+        """Return metadata for a single file."""
+        for _, dataset in client.datasets.items():
+            for file_ in dataset.files:
+                if file_.path == path:
+                    return file_
+
+    @staticmethod
+    def _get_commit_sha_from_label(dataset_file):
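+        """Return the commit SHA from a file's ``path@sha`` label."""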
+        label = dataset_file._label
+        if '@' in label:
+            return label.split('@')[1]
+        return label
+
 
 def check_for_git_repo(url):
     """Check if a url points to a git repository."""
diff --git a/tests/cli/test_datasets.py b/tests/cli/test_datasets.py
index 0037ff7935..c83cc22faa 100644
--- a/tests/cli/test_datasets.py
+++ b/tests/cli/test_datasets.py
@@ -431,16 +431,34 @@ def test_relative_git_import_to_dataset(tmpdir, runner, project, client):
     )
     assert 1 == result.exit_code
 
-    # copy a non-existing source
+
+@pytest.mark.parametrize(
+    'add_options,n_urls,message', [
+        (['-s', 'file', '-d', 'new-file'], 0, 'No URL is specified'),
+        (['-s', 'file'], 2, 'Cannot add multiple URLs'),
+        (['-d', 'file'], 2, 'Cannot add multiple URLs'),
+        (['-s', 'non-existing'], 1, 'No such file or directory'),
+    ]
+)
+def test_usage_error_in_import_from_git_repo(
+    data_repository, directory_tree, runner, project, client, add_options,
+    n_urls, message
+):
+    """Test user errors when adding to a dataset from a Git repository."""
+    # create a dataset
+    result = runner.invoke(cli, ['dataset', 'create', 'dataset'])
+    assert 0 == result.exit_code
+
+    urls = n_urls * [directory_tree.strpath]
+
+    # add data from the Git repository
     result = runner.invoke(
         cli,
-        [
-            'dataset', 'add', 'relative', '--source', 'non-existing',
-            str(tmpdir)
-        ],
-        catch_exceptions=True,
+        ['dataset', 'add', 'dataset'] + add_options + urls,
+        catch_exceptions=False,
     )
     assert 2 == result.exit_code
+    assert message in result.output
 
 
 def test_dataset_add_with_link(tmpdir, runner, project, client):
@@ -1246,7 +1264,7 @@ def test_dataset_clean_up_when_add_fails(runner, client):
 
 
 def read_dataset_file_metadata(client, dataset_name, filename):
-
+    """Return metadata from dataset's YAML file."""
     path = client.dataset_path(dataset_name)
     assert path.exists()
 
@@ -1295,8 +1313,10 @@
     assert after['_label'] != before['_label']
     assert after['added'] == before['added']
     assert after['url'] == before['url']
-    assert after['based_on']['_id'] != before['based_on']['_id']
+    assert after['based_on']['_id'] == before['based_on']['_id']
+    assert after['based_on']['_label'] != before['based_on']['_label']
     assert after['based_on']['path'] == before['based_on']['path']
+    assert after['based_on']['based_on'] is None
 
 
 def test_dataset_update_remove_file(
     data_repository, directory_tree, runner, project, client
@@ -1321,6 +1341,12 @@
 
     result = runner.invoke(cli, ['dataset', 'update'], catch_exceptions=False)
     assert 0 == result.exit_code
+    assert file_path.exists()
+
+    result = runner.invoke(
+        cli, ['dataset', 'update', '--delete'], catch_exceptions=False
+    )
+    assert 0 == result.exit_code
     assert not file_path.exists()
 
 
@@ -1420,3 +1446,103 @@ def test_dataset_update_multiple_datasets(
     assert data == 'new content'
     data = open(str(client.path / 'data' / 'dataset-2' / 'file')).read()
     assert data == 'new content'
+
+
+def test_empty_update(client, runner, data_repository, directory_tree):
+    """Test update when nothing changed."""
+    # Add dataset to project
+    result = runner.invoke(
+        cli, ['dataset', 'add', '--create', 'dataset', directory_tree.strpath],
+        catch_exceptions=False
+    )
+    assert 0 == result.exit_code
+
+    commit_sha_before = client.repo.head.object.hexsha
+    result = runner.invoke(cli, ['dataset', 'update'], catch_exceptions=False)
+    assert 0 == result.exit_code
+    commit_sha_after = client.repo.head.object.hexsha
+    assert commit_sha_after == commit_sha_before
+
+
+def test_import_from_renku_project(remote_project, client, runner):
+    """Test that a dataset imported from another Renku project keeps its
+    metadata."""
+    from renku.core.management import LocalClient
+
+    _, remote_project_path = remote_project
+
+    remote_client = LocalClient(remote_project_path)
+    remote = read_dataset_file_metadata(
+        remote_client, 'remote-dataset', 'file'
+    )
+
+    runner.invoke(cli, ['-S', 'dataset', 'create', 'remote-dataset'])
+    result = runner.invoke(
+        cli,
+        [
+            '-S', 'dataset', 'add', '--src', 'data', 'remote-dataset',
+            str(remote_project_path)
+        ],
+        catch_exceptions=False,
+    )
+    assert 0 == result.exit_code
+
+    metadata = read_dataset_file_metadata(client, 'remote-dataset', 'file')
+    assert metadata['creator'] == remote['creator']
+    assert metadata['based_on']['_id'] == remote['_id']
+    assert metadata['based_on']['_label'] == remote['_label']
+    assert metadata['based_on']['path'] == remote['path']
+    assert metadata['based_on']['based_on'] is None
+    assert metadata['based_on']['url'] == remote_project_path
+
+
+def test_update_renku_project_dataset(
+    remote_project, client, runner, directory_tree, data_repository
+):
+    """Test that a dataset imported from another Renku project is updated."""
+    remote_runner, remote_project_path = remote_project
+
+    runner.invoke(cli, ['-S', 'dataset', 'create', 'remote-dataset'])
+    result = runner.invoke(
+        cli,
+        [
+            '-S', 'dataset', 'add', '--src', 'data/remote-dataset/file',
+            'remote-dataset',
+            str(remote_project_path)
+        ],
+        catch_exceptions=False,
+    )
+    assert 0 == result.exit_code
+
+    before = read_dataset_file_metadata(client, 'remote-dataset', 'file')
+
+    # Update and commit original file
+    file_ = directory_tree.join('file')
+    file_.write('new content')
+    data_repository.index.add([file_.strpath])
+    data_repository.index.commit('updated')
+
+    assert (Path(file_.strpath)).read_text() == 'new content'
+
+    # Update remote project
+    os.chdir(remote_project_path)
+    result = remote_runner.invoke(
+        cli, ['dataset', 'update'], catch_exceptions=False
+    )
+    assert 0 == result.exit_code
+
+    # Update project
+    os.chdir(str(client.path))
+    result = runner.invoke(cli, ['dataset', 'update'], catch_exceptions=False)
+    assert 0 == result.exit_code
+    content = (client.path / 'data' / 'remote-dataset' / 'file').read_text()
+    assert content == 'new content'
+
+    after = read_dataset_file_metadata(client, 'remote-dataset', 'file')
+    assert after['_id'] == before['_id']
+    assert after['_label'] != before['_label']
+    assert after['added'] == before['added']
+    assert after['url'] == before['url']
+    assert after['based_on']['_id'] == before['based_on']['_id']
+    assert after['based_on']['_label'] != before['based_on']['_label']
+    assert after['based_on']['path'] == before['based_on']['path']
+    assert after['based_on']['based_on'] is None
diff --git a/tests/core/commands/test_cli.py b/tests/core/commands/test_cli.py
index bffc9325ca..da94af86b3 100644
--- a/tests/core/commands/test_cli.py
+++ b/tests/core/commands/test_cli.py
@@ -787,7 +787,7 @@ def test_config_manager_set_value(client, global_config_dir, global_only):
     assert 'zenodo' not in config.sections()
 
 
-def test_config_get_value(client):
+def test_config_get_value(client, global_config_dir):
     """Check reading from configuration."""
     # Value set locally is not visible globally
     client.set_value('local', 'key', 'local-value')
diff --git a/tests/core/commands/test_dataset.py b/tests/core/commands/test_dataset.py
index 6095af6db9..1b29726bcf 100644
--- a/tests/core/commands/test_dataset.py
+++ b/tests/core/commands/test_dataset.py
@@ -25,6 +25,7 @@
 import git
 import pytest
 
+from renku.core import errors
 from renku.core.models.creators import Creator
 from renku.core.models.datasets import Dataset, DatasetFile
 
@@ -59,7 +60,7 @@ def not_raises():
     ('', 'tempp', git.NoSuchPathError),
     ('http://', 'example.com/file', None),
     ('https://', 'example.com/file', None),
-    ('bla://', 'file', NotImplementedError)]
+    ('bla://', 'file', errors.UrlSchemeNotSupported)]
 )
 def test_data_add(
     scheme, path, error, client, data_file, directory_tree, dataset_responses