Skip to content

Commit 75ce369

Browse files
Panaetius and jsam committed
fix: adds _label and commit data to imported dataset files, single commit for imports (#651)
* fix: adds _label and commit data to imported dataset files; dataset files are imported in a single commit
* Changes bad URL reporting style

Co-Authored-By: Sam <contact@justsam.io>
1 parent 95ed468 commit 75ce369

File tree

4 files changed

+48
-35
lines changed

4 files changed

+48
-35
lines changed

renku/api/datasets.py

Lines changed: 21 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -145,29 +145,36 @@ def with_dataset(self, name=None):
145145
dataset.to_yaml()
146146

147147
def add_data_to_dataset(
148-
self, dataset, url, git=False, force=False, **kwargs
148+
self, dataset, urls, git=False, force=False, **kwargs
149149
):
150150
"""Import the data into the data directory."""
151151
dataset_path = self.path / self.datadir / dataset.name
152-
git = git or check_for_git_repo(url)
153152

154-
target = kwargs.pop('target', None)
153+
files = []
155154

156-
if git:
157-
if isinstance(target, (str, NoneType)):
158-
files = self._add_from_git(
159-
dataset, dataset_path, url, target, **kwargs
160-
)
161-
else:
162-
files = []
163-
for t in target:
155+
for url in urls:
156+
git = git or check_for_git_repo(url)
157+
158+
target = kwargs.pop('target', None)
159+
160+
if git:
161+
if isinstance(target, (str, NoneType)):
164162
files.extend(
165163
self._add_from_git(
166-
dataset, dataset_path, url, t, **kwargs
164+
dataset, dataset_path, url, target, **kwargs
167165
)
168166
)
169-
else:
170-
files = self._add_from_url(dataset, dataset_path, url, **kwargs)
167+
else:
168+
for t in target:
169+
files.extend(
170+
self._add_from_git(
171+
dataset, dataset_path, url, t, **kwargs
172+
)
173+
)
174+
else:
175+
files.extend(
176+
self._add_from_url(dataset, dataset_path, url, **kwargs)
177+
)
171178

172179
ignored = self.find_ignored_paths(*(data['path']
173180
for data in files)) or []

renku/cli/dataset.py

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -289,15 +289,14 @@ def add_to_dataset(
289289
with client.with_dataset(name=name) as dataset:
290290
target = target if target else None
291291
with progressbar(urls, label='Adding data to dataset') as bar:
292-
for url in bar:
293-
client.add_data_to_dataset(
294-
dataset,
295-
url,
296-
link=link,
297-
target=target,
298-
relative_to=relative_to,
299-
force=force,
300-
)
292+
client.add_data_to_dataset(
293+
dataset,
294+
bar,
295+
link=link,
296+
target=target,
297+
relative_to=relative_to,
298+
force=force,
299+
)
301300

302301
if with_metadata:
303302

@@ -310,11 +309,13 @@ def add_to_dataset(
310309

311310
file_.path = added_.path
312311
file_.creator = with_metadata.creator
312+
file_._label = added_._label
313+
file_.commit = added_.commit
313314

314315
dataset.update_metadata(with_metadata)
315316

316317
except FileNotFoundError:
317-
raise BadParameter('Could not process {0}'.format(url))
318+
raise BadParameter('Could not process \n{0}'.format('\n'.join(urls)))
318319

319320

320321
@dataset.command('ls-files')

renku/models/datasets.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -432,10 +432,16 @@ def update_metadata(self, other_dataset):
432432

433433
def update_files(self, files):
434434
"""Update files with collection of DatasetFile objects."""
435-
to_insert = [
436-
new_file
437-
for new_file in files if not self.find_file(new_file.path)
438-
]
435+
to_insert = []
436+
437+
for new_file in files:
438+
existing_file = self.find_file(new_file.path)
439+
if existing_file is None:
440+
to_insert.append(new_file)
441+
else:
442+
existing_file.commit = new_file.commit
443+
existing_file._label = new_file._label
444+
439445
self.files += to_insert
440446

441447
def rename_files(self, rename):

tests/test_dataset.py

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,7 @@ def test_data_add(
7777
'email': 'me@example.com',
7878
'identifier': 'me_id'
7979
}]
80-
client.add_data_to_dataset(d, '{}{}'.format(scheme, path))
80+
client.add_data_to_dataset(d, ['{}{}'.format(scheme, path)])
8181

8282
with open('data/dataset/file') as f:
8383
assert f.read() == '1234'
@@ -99,7 +99,7 @@ def test_data_add(
9999
'identifier': 'me_id'
100100
}]
101101
client.add_data_to_dataset(
102-
d, '{}{}'.format(scheme, path), nocopy=True
102+
d, ['{}{}'.format(scheme, path)], nocopy=True
103103
)
104104
assert os.path.exists('data/dataset/file')
105105

@@ -113,8 +113,7 @@ def test_data_add_recursive(directory_tree, client):
113113
'identifier': 'me_id'
114114
}]
115115
client.add_data_to_dataset(
116-
dataset,
117-
directory_tree.join('dir2').strpath
116+
dataset, [directory_tree.join('dir2').strpath]
118117
)
119118

120119
assert os.path.basename(
@@ -134,7 +133,7 @@ def dataset_serialization(client, dataset, data_file):
134133

135134
assert all([key in d_dict for key in ('name', 'identifier', 'files')])
136135
assert not len(d_dict['files'].values())
137-
client.add_data_to_dataset(dataset, str(data_file))
136+
client.add_data_to_dataset(dataset, [str(data_file)])
138137
d_dict = dataset.to_dict()
139138
assert len(d_dict['files'].values())
140139

@@ -144,15 +143,15 @@ def test_git_repo_import(client, dataset, tmpdir, data_repository):
144143
# add data from local repo
145144
client.add_data_to_dataset(
146145
dataset,
147-
os.path.join(os.path.dirname(data_repository.git_dir), 'dir2')
146+
[os.path.join(os.path.dirname(data_repository.git_dir), 'dir2')]
148147
)
149148
assert os.stat('data/dataset/dir2/file2')
150149
assert dataset.files[0].path.endswith('dir2/file2')
151150
assert os.stat('.renku/vendors/local')
152151

153152
# check that the creators are properly parsed from commits
154153
client.add_data_to_dataset(
155-
dataset, os.path.dirname(data_repository.git_dir), target='file'
154+
dataset, [os.path.dirname(data_repository.git_dir)], target='file'
156155
)
157156

158157
assert len(dataset.files[1].creator) == 2

0 commit comments

Comments
 (0)