Skip to content

Commit 84e59d0

Browse files
feat: do not create dataset implicitly (#779)
* feat: do not create dataset implicitly * refactor: remove dataset overwrite prompt
1 parent b8b1ae2 commit 84e59d0

File tree

8 files changed

+112
-91
lines changed

8 files changed

+112
-91
lines changed

conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def client(project):
176176
@pytest.fixture
177177
def dataset(client):
178178
"""Create a dataset."""
179-
with client.with_dataset(name='dataset') as dataset:
179+
with client.with_dataset(name='dataset', create=True) as dataset:
180180
dataset.creator = [{
181181
'affiliation': 'xxx',
182182
'email': 'me@example.com',

renku/cli/dataset.py

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@
5858
This will copy the contents of ``data-url`` to the dataset and add it
5959
to the dataset metadata.
6060
61+
You can create a dataset when you add data to it for the first time by passing
62+
the ``--create`` flag to the add command:
63+
64+
.. code-block:: console
65+
66+
$ renku dataset add --create new-dataset http://data-url
67+
6168
To add data from a git repository, you can specify it via https or git+ssh
6269
URL schemes. For example,
6370
@@ -231,16 +238,6 @@
231238
from renku.core.errors import DatasetNotFound, InvalidAccessToken
232239

233240

234-
def prompt_duplicate_dataset(existing_dataset):
235-
"""Check if existing dataset should be overwritten.
236-
237-
:return: True if user confirmed overwriting.
238-
"""
239-
warn_ = WARNING + 'This dataset already exists.'
240-
click.echo(warn_)
241-
return click.confirm('Do you wish to overwrite it?', abort=True)
242-
243-
244241
def prompt_access_token(exporter):
245242
"""Prompt user for an access token for a provider.
246243
@@ -365,7 +362,7 @@ def dataset(ctx, revision, datadir, format):
365362
@click.argument('name')
366363
def create(name):
367364
"""Create an empty dataset in the current repo."""
368-
create_dataset(name, handle_duplicate_fn=prompt_duplicate_dataset)
365+
create_dataset(name)
369366
click.secho('OK', fg='green')
370367

371368

@@ -387,6 +384,9 @@ def edit(dataset_id):
387384
@click.option(
388385
'--force', is_flag=True, help='Allow adding otherwise ignored files.'
389386
)
387+
@click.option(
388+
'--create', is_flag=True, help='Create dataset if it does not exist.'
389+
)
390390
@click.option(
391391
'-s',
392392
'--src',
@@ -404,14 +404,15 @@ def edit(dataset_id):
404404
default='',
405405
help='Destination file or directory within the dataset path'
406406
)
407-
def add(name, urls, link, force, sources, destination):
407+
def add(name, urls, link, force, create, sources, destination):
408408
"""Add data to a dataset."""
409409
progress = partial(progressbar, label='Adding data to dataset')
410410
add_file(
411411
urls=urls,
412412
name=name,
413413
link=link,
414414
force=force,
415+
create=create,
415416
sources=sources,
416417
destination=destination,
417418
urlscontext=progress
@@ -580,10 +581,7 @@ def export_(id, provider, publish, tag):
580581
is_flag=True,
581582
help='Extract files before importing to dataset.'
582583
)
583-
@click.option(
584-
'-y', '--yes', is_flag=True, help='Confirm unlinking of all files.'
585-
)
586-
def import_(uri, name, extract, yes):
584+
def import_(uri, name, extract):
587585
"""Import data from a 3rd party provider.
588586
589587
Supported providers: [Zenodo, Dataverse]
@@ -612,11 +610,9 @@ def _init(lock, id_queue):
612610
name,
613611
extract,
614612
with_prompt=True,
615-
handle_duplicate_fn=prompt_duplicate_dataset,
616613
pool_init_fn=_init,
617614
pool_init_args=(mp.RLock(), id_queue),
618-
download_file_fn=download_file_with_progress,
619-
force=yes
615+
download_file_fn=download_file_with_progress
620616
)
621617
click.secho('OK', fg='green')
622618

renku/core/commands/dataset.py

Lines changed: 33 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -100,17 +100,15 @@ def dataset_parent(client, revision, datadir, format, ctx=None):
100100
@pass_local_client(
101101
clean=False, commit=True, commit_only=DATASET_METADATA_PATHS
102102
)
103-
def create_dataset(client, name, handle_duplicate_fn=None):
103+
def create_dataset(client, name):
104104
"""Create an empty dataset in the current repo.
105105
106106
:raises: ``renku.core.errors.ParameterError``
107107
"""
108-
existing = client.load_dataset(name=name)
109-
if (not existing or handle_duplicate_fn and handle_duplicate_fn(existing)):
110-
with client.with_dataset(name=name) as dataset:
111-
creator = Creator.from_git(client.repo)
112-
if creator not in dataset.creator:
113-
dataset.creator.append(creator)
108+
with client.with_dataset(name=name, create=True) as dataset:
109+
creator = Creator.from_git(client.repo)
110+
if creator not in dataset.creator:
111+
dataset.creator.append(creator)
114112

115113

116114
@pass_local_client(
@@ -140,15 +138,16 @@ def add_file(
140138
name,
141139
link=False,
142140
force=False,
141+
create=False,
143142
sources=(),
144143
destination='',
145144
with_metadata=None,
146145
urlscontext=contextlib.nullcontext
147146
):
148147
"""Add data file to a dataset."""
149148
add_to_dataset(
150-
client, urls, name, link, force, sources, destination, with_metadata,
151-
urlscontext
149+
client, urls, name, link, force, create, sources, destination,
150+
with_metadata, urlscontext
152151
)
153152

154153

@@ -158,6 +157,7 @@ def add_to_dataset(
158157
name,
159158
link=False,
160159
force=False,
160+
create=False,
161161
sources=(),
162162
destination='',
163163
with_metadata=None,
@@ -169,7 +169,9 @@ def add_to_dataset(
169169
with_metadata.identifier
170170
) if with_metadata else None
171171
try:
172-
with client.with_dataset(name=name, identifier=identifier) as dataset:
172+
with client.with_dataset(
173+
name=name, identifier=identifier, create=create
174+
) as dataset:
173175
with urlscontext(urls) as bar:
174176
client.add_data_to_dataset(
175177
dataset,
@@ -195,6 +197,13 @@ def add_to_dataset(
195197

196198
dataset.update_metadata(with_metadata)
197199

200+
except DatasetNotFound:
201+
raise DatasetNotFound(
202+
'Dataset "{0}" does not exist.\n'
203+
'Use "renku dataset create {0}" to create the dataset or retry '
204+
'"renku dataset add {0}" command with "--create" option for '
205+
'automatic dataset creation.'.format(name)
206+
)
198207
except (FileNotFoundError, git.exc.NoSuchPathError):
199208
raise ParameterError('Could not process \n{0}'.format('\n'.join(urls)))
200209

@@ -393,8 +402,6 @@ def import_dataset(
393402
name,
394403
extract,
395404
with_prompt=False,
396-
force=False,
397-
handle_duplicate_fn=None,
398405
pool_init_fn=None,
399406
pool_init_args=None,
400407
download_file_fn=default_download_file
@@ -428,8 +435,7 @@ def import_dataset(
428435
record.links.get('latest_html')
429436
) + text_prompt
430437

431-
if not force:
432-
click.confirm(text_prompt, abort=True)
438+
click.confirm(text_prompt, abort=True)
433439

434440
except KeyError as e:
435441
raise ParameterError((
@@ -482,27 +488,20 @@ def import_dataset(
482488
pool.close()
483489

484490
dataset_name = name or dataset.display_name
485-
existing = client.load_dataset(name=dataset_name)
486-
if (
487-
not existing or force or
488-
(handle_duplicate_fn and handle_duplicate_fn(dataset_name))
489-
):
490-
add_to_dataset(
491-
client,
492-
urls=[str(p) for p in Path(data_folder).glob('*')],
493-
name=dataset_name,
494-
with_metadata=dataset
495-
)
491+
add_to_dataset(
492+
client,
493+
urls=[str(p) for p in Path(data_folder).glob('*')],
494+
name=dataset_name,
495+
with_metadata=dataset,
496+
create=True
497+
)
496498

497-
if dataset.version:
498-
tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
499-
tag_dataset(
500-
client,
501-
dataset_name,
502-
tag_name,
503-
'Tag {} created by renku import'.format(dataset.version),
504-
force=True
505-
)
499+
if dataset.version:
500+
tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
501+
tag_dataset(
502+
client, dataset_name, tag_name,
503+
'Tag {} created by renku import'.format(dataset.version)
504+
)
506505

507506

508507
@pass_local_client(clean=True, commit=True, commit_only=DATASET_METADATA_PATHS)

renku/core/errors.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,10 +293,8 @@ def __init__(self, returncode, success_codes=None):
293293
class DatasetNotFound(RenkuException):
294294
"""Raise when dataset is not found."""
295295

296-
def __init__(self):
296+
def __init__(self, msg='Dataset is not found.'):
297297
"""Build a custom message."""
298-
msg = 'Dataset is not found.'
299-
300298
super(DatasetNotFound, self).__init__(msg)
301299

302300

renku/core/management/datasets.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def load_dataset(self, name=None):
109109
return self.get_dataset(path)
110110

111111
@contextmanager
112-
def with_dataset(self, name=None, identifier=None):
112+
def with_dataset(self, name=None, identifier=None, create=False):
113113
"""Yield an editable metadata object for a dataset."""
114114
dataset = self.load_dataset(name=name)
115115
clean_up_required = False
@@ -121,6 +121,8 @@ def with_dataset(self, name=None, identifier=None):
121121
'Dataset name {} is not valid.'.format(name)
122122
)
123123

124+
if not create:
125+
raise errors.DatasetNotFound
124126
clean_up_required = True
125127
dataset_ref = None
126128
identifier = str(uuid.uuid4())
@@ -143,6 +145,11 @@ def with_dataset(self, name=None, identifier=None):
143145
)
144146
dataset_ref.set_reference(path)
145147

148+
elif create:
149+
raise errors.DatasetExistsError(
150+
'Dataset exists: "{}".'.format(name)
151+
)
152+
146153
dataset_path = self.path / self.datadir / dataset.name
147154
dataset_path.mkdir(parents=True, exist_ok=True)
148155

0 commit comments

Comments
 (0)