28
28
29
29
import attr
30
30
import requests
31
- import yaml
32
31
33
32
from renku import errors
34
33
from renku ._compat import Path
@@ -65,20 +64,26 @@ def datasets_from_commit(self, commit=None):
65
64
blob = tree / self .METADATA
66
65
except KeyError :
67
66
continue
68
-
69
- yield Dataset .from_jsonld (
70
- yaml .safe_load (blob .data_stream .read ()),
71
- __reference__ = Path (blob .path ),
67
+ dataset = Dataset .from_yaml (
68
+ self .path / Path (blob .path ), client = self
72
69
)
70
+ dataset .commit = commit
71
+ yield dataset
73
72
74
73
@property
75
74
def datasets (self ):
76
75
"""Return mapping from path to dataset."""
77
76
result = {}
78
77
for path in self .renku_datasets_path .rglob (self .METADATA ):
79
- result [path ] = Dataset . from_yaml (path )
78
+ result [path ] = self . get_dataset (path )
80
79
return result
81
80
81
+ def get_dataset (self , path ):
82
+ """Return a dataset from a given path."""
83
+ if not path .is_absolute ():
84
+ path = self .path / path
85
+ return Dataset .from_yaml (path , client = self )
86
+
82
87
def dataset_path (self , name ):
83
88
"""Get dataset path from name."""
84
89
from renku .models .refs import LinkReference
@@ -98,7 +103,7 @@ def load_dataset(self, name=None):
98
103
if name :
99
104
path = self .dataset_path (name )
100
105
if path .exists ():
101
- dataset = Dataset . from_yaml (path )
106
+ dataset = self . get_dataset (path )
102
107
103
108
return dataset
104
109
@@ -116,7 +121,9 @@ def with_dataset(self, name=None):
116
121
path .parent .mkdir (parents = True , exist_ok = True )
117
122
118
123
with with_reference (path ):
119
- dataset = Dataset (identifier = identifier , name = name )
124
+ dataset = Dataset (
125
+ identifier = identifier , name = name , client = self
126
+ )
120
127
121
128
if name :
122
129
LinkReference .create (client = self , name = 'datasets/' +
@@ -150,32 +157,38 @@ def add_data_to_dataset(
150
157
dataset , dataset_path , url , target , ** kwargs
151
158
)
152
159
else :
153
- files = {}
160
+ files = []
154
161
for t in target :
155
- files .update (
162
+ files .extend (
156
163
self ._add_from_git (
157
164
dataset , dataset_path , url , t , ** kwargs
158
165
)
159
166
)
160
167
else :
161
168
files = self ._add_from_url (dataset , dataset_path , url , ** kwargs )
162
169
163
- ignored = self .find_ignored_paths (
164
- * [
165
- os .path .relpath (
166
- str (self .renku_datasets_path / dataset .uid / key ),
167
- start = str (self .path ),
168
- ) for key in files .keys ()
169
- ]
170
- )
170
+ ignored = self .find_ignored_paths (* (data ['path' ]
171
+ for data in files )) or []
171
172
172
173
if ignored :
173
174
if force :
174
175
self .repo .git .add (* ignored , force = True )
175
176
else :
176
177
raise errors .IgnoredFiles (ignored )
177
178
178
- dataset .update_files (files .values ())
179
+ # commit all new data
180
+ file_paths = {str (data ['path' ]) for data in files if str (data ['path' ])}
181
+ self .repo .git .add (* (file_paths - set (ignored )))
182
+ self .repo .index .commit (
183
+ 'renku dataset: commiting {} newly added files' .
184
+ format (len (file_paths ) + len (ignored ))
185
+ )
186
+
187
+ # Generate the DatasetFiles
188
+ dataset_files = []
189
+ for data in files :
190
+ dataset_files .append (DatasetFile .from_revision (self , ** data ))
191
+ dataset .update_files (dataset_files )
179
192
180
193
def _add_from_url (self , dataset , path , url , link = False , ** kwargs ):
181
194
"""Process an add from url and return the location on disk."""
@@ -202,15 +215,16 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
202
215
203
216
# if we have a directory, recurse
204
217
if src .is_dir ():
205
- files = {}
218
+ files = []
206
219
dst .mkdir (parents = True , exist_ok = True )
207
220
for f in src .iterdir ():
208
- files .update (
221
+ files .extend (
209
222
self ._add_from_url (
210
223
dataset ,
211
224
dst ,
212
225
f .absolute ().as_posix (),
213
226
link = link ,
227
+ ** kwargs
214
228
)
215
229
)
216
230
return files
@@ -243,17 +257,14 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
243
257
dst .chmod (mode & ~ (stat .S_IXUSR | stat .S_IXGRP | stat .S_IXOTH ))
244
258
245
259
self .track_paths_in_storage (str (dst .relative_to (self .path )))
246
- dataset_path = self .renku_datasets_path / dataset .name
247
- result = os .path .relpath (str (dst ), start = str (dataset_path ))
248
- return {
249
- result :
250
- DatasetFile (
251
- path = result ,
252
- url = url ,
253
- creator = dataset .creator ,
254
- dataset = dataset .name ,
255
- )
256
- }
260
+
261
+ return [{
262
+ 'path' : dst .relative_to (self .path ),
263
+ 'url' : url ,
264
+ 'creator' : dataset .creator ,
265
+ 'dataset' : dataset .name ,
266
+ 'parent' : self
267
+ }]
257
268
258
269
def _add_from_git (self , dataset , path , url , target , ** kwargs ):
259
270
"""Process adding resources from another git repository.
@@ -280,21 +291,13 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
280
291
relative_url = None
281
292
282
293
if relative_url :
283
- result = str (
284
- os .path .relpath (
285
- str (relative_url ),
286
- start = str (self .renku_datasets_path / dataset .uid ),
287
- )
288
- )
289
- return {
290
- result :
291
- DatasetFile (
292
- path = result ,
293
- url = url ,
294
- creator = dataset .creator ,
295
- dataset = dataset .name ,
296
- )
297
- }
294
+ return [{
295
+ 'path' : url ,
296
+ 'url' : url ,
297
+ 'creator' : dataset .creator ,
298
+ 'dataset' : dataset .name ,
299
+ 'parent' : self
300
+ }]
298
301
299
302
warnings .warn ('Importing local git repository, use HTTPS' )
300
303
# determine where is the base repo path
@@ -355,12 +358,12 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
355
358
356
359
# if we have a directory, recurse
357
360
if src .is_dir ():
358
- files = {}
361
+ files = []
359
362
dst .mkdir (parents = True , exist_ok = True )
360
363
# FIXME get all files from submodule index
361
364
for f in src .iterdir ():
362
365
try :
363
- files .update (
366
+ files .extend (
364
367
self ._add_from_git (
365
368
dataset ,
366
369
path ,
@@ -386,23 +389,18 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
386
389
if creator not in creators :
387
390
creators .append (creator )
388
391
389
- dataset_path = self .renku_datasets_path / dataset .name
390
- result = os .path .relpath (str (dst ), start = str (dataset_path ))
391
-
392
392
if u .scheme in ('' , 'file' ):
393
393
url = None
394
394
else :
395
395
url = '{}/{}' .format (url , target )
396
396
397
- return {
398
- result :
399
- DatasetFile (
400
- path = result ,
401
- url = url ,
402
- creator = creators ,
403
- dataset = dataset .name , # TODO detect original dataset
404
- )
405
- }
397
+ return [{
398
+ 'path' : dst .relative_to (self .path ),
399
+ 'url' : url ,
400
+ 'creator' : creators ,
401
+ 'dataset' : dataset .name ,
402
+ 'parent' : self
403
+ }]
406
404
407
405
def get_relative_url (self , url ):
408
406
"""Determine if the repo url should be relative."""
0 commit comments