Skip to content

Commit fb443d7

Browse files
rokroskar authored and jsam committed
feat: add dataset metadata to the KG (#558)
* feat: export Datasets to graph
* chore: update restricted dot graph
* fix: update dataset metadata
* feat: adding DatasetFile from git
* allow for creation of an Entity object without a commit in the repo
* enable DatasetFile.from_revision
* preserve source path in Entity if it is added from a Submodule
1 parent 7938ac4 commit fb443d7

20 files changed

+329
-181
lines changed

renku/api/datasets.py

Lines changed: 59 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828

2929
import attr
3030
import requests
31-
import yaml
3231

3332
from renku import errors
3433
from renku._compat import Path
@@ -65,20 +64,26 @@ def datasets_from_commit(self, commit=None):
6564
blob = tree / self.METADATA
6665
except KeyError:
6766
continue
68-
69-
yield Dataset.from_jsonld(
70-
yaml.safe_load(blob.data_stream.read()),
71-
__reference__=Path(blob.path),
67+
dataset = Dataset.from_yaml(
68+
self.path / Path(blob.path), client=self
7269
)
70+
dataset.commit = commit
71+
yield dataset
7372

7473
@property
7574
def datasets(self):
7675
"""Return mapping from path to dataset."""
7776
result = {}
7877
for path in self.renku_datasets_path.rglob(self.METADATA):
79-
result[path] = Dataset.from_yaml(path)
78+
result[path] = self.get_dataset(path)
8079
return result
8180

81+
def get_dataset(self, path):
82+
"""Return a dataset from a given path."""
83+
if not path.is_absolute():
84+
path = self.path / path
85+
return Dataset.from_yaml(path, client=self)
86+
8287
def dataset_path(self, name):
8388
"""Get dataset path from name."""
8489
from renku.models.refs import LinkReference
@@ -98,7 +103,7 @@ def load_dataset(self, name=None):
98103
if name:
99104
path = self.dataset_path(name)
100105
if path.exists():
101-
dataset = Dataset.from_yaml(path)
106+
dataset = self.get_dataset(path)
102107

103108
return dataset
104109

@@ -116,7 +121,9 @@ def with_dataset(self, name=None):
116121
path.parent.mkdir(parents=True, exist_ok=True)
117122

118123
with with_reference(path):
119-
dataset = Dataset(identifier=identifier, name=name)
124+
dataset = Dataset(
125+
identifier=identifier, name=name, client=self
126+
)
120127

121128
if name:
122129
LinkReference.create(client=self, name='datasets/' +
@@ -150,32 +157,38 @@ def add_data_to_dataset(
150157
dataset, dataset_path, url, target, **kwargs
151158
)
152159
else:
153-
files = {}
160+
files = []
154161
for t in target:
155-
files.update(
162+
files.extend(
156163
self._add_from_git(
157164
dataset, dataset_path, url, t, **kwargs
158165
)
159166
)
160167
else:
161168
files = self._add_from_url(dataset, dataset_path, url, **kwargs)
162169

163-
ignored = self.find_ignored_paths(
164-
*[
165-
os.path.relpath(
166-
str(self.renku_datasets_path / dataset.uid / key),
167-
start=str(self.path),
168-
) for key in files.keys()
169-
]
170-
)
170+
ignored = self.find_ignored_paths(*(data['path']
171+
for data in files)) or []
171172

172173
if ignored:
173174
if force:
174175
self.repo.git.add(*ignored, force=True)
175176
else:
176177
raise errors.IgnoredFiles(ignored)
177178

178-
dataset.update_files(files.values())
179+
# commit all new data
180+
file_paths = {str(data['path']) for data in files if str(data['path'])}
181+
self.repo.git.add(*(file_paths - set(ignored)))
182+
self.repo.index.commit(
183+
'renku dataset: commiting {} newly added files'.
184+
format(len(file_paths) + len(ignored))
185+
)
186+
187+
# Generate the DatasetFiles
188+
dataset_files = []
189+
for data in files:
190+
dataset_files.append(DatasetFile.from_revision(self, **data))
191+
dataset.update_files(dataset_files)
179192

180193
def _add_from_url(self, dataset, path, url, link=False, **kwargs):
181194
"""Process an add from url and return the location on disk."""
@@ -202,15 +215,16 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
202215

203216
# if we have a directory, recurse
204217
if src.is_dir():
205-
files = {}
218+
files = []
206219
dst.mkdir(parents=True, exist_ok=True)
207220
for f in src.iterdir():
208-
files.update(
221+
files.extend(
209222
self._add_from_url(
210223
dataset,
211224
dst,
212225
f.absolute().as_posix(),
213226
link=link,
227+
**kwargs
214228
)
215229
)
216230
return files
@@ -243,17 +257,14 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
243257
dst.chmod(mode & ~(stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH))
244258

245259
self.track_paths_in_storage(str(dst.relative_to(self.path)))
246-
dataset_path = self.renku_datasets_path / dataset.name
247-
result = os.path.relpath(str(dst), start=str(dataset_path))
248-
return {
249-
result:
250-
DatasetFile(
251-
path=result,
252-
url=url,
253-
creator=dataset.creator,
254-
dataset=dataset.name,
255-
)
256-
}
260+
261+
return [{
262+
'path': dst.relative_to(self.path),
263+
'url': url,
264+
'creator': dataset.creator,
265+
'dataset': dataset.name,
266+
'parent': self
267+
}]
257268

258269
def _add_from_git(self, dataset, path, url, target, **kwargs):
259270
"""Process adding resources from another git repository.
@@ -280,21 +291,13 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
280291
relative_url = None
281292

282293
if relative_url:
283-
result = str(
284-
os.path.relpath(
285-
str(relative_url),
286-
start=str(self.renku_datasets_path / dataset.uid),
287-
)
288-
)
289-
return {
290-
result:
291-
DatasetFile(
292-
path=result,
293-
url=url,
294-
creator=dataset.creator,
295-
dataset=dataset.name,
296-
)
297-
}
294+
return [{
295+
'path': url,
296+
'url': url,
297+
'creator': dataset.creator,
298+
'dataset': dataset.name,
299+
'parent': self
300+
}]
298301

299302
warnings.warn('Importing local git repository, use HTTPS')
300303
# determine where is the base repo path
@@ -355,12 +358,12 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
355358

356359
# if we have a directory, recurse
357360
if src.is_dir():
358-
files = {}
361+
files = []
359362
dst.mkdir(parents=True, exist_ok=True)
360363
# FIXME get all files from submodule index
361364
for f in src.iterdir():
362365
try:
363-
files.update(
366+
files.extend(
364367
self._add_from_git(
365368
dataset,
366369
path,
@@ -386,23 +389,18 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
386389
if creator not in creators:
387390
creators.append(creator)
388391

389-
dataset_path = self.renku_datasets_path / dataset.name
390-
result = os.path.relpath(str(dst), start=str(dataset_path))
391-
392392
if u.scheme in ('', 'file'):
393393
url = None
394394
else:
395395
url = '{}/{}'.format(url, target)
396396

397-
return {
398-
result:
399-
DatasetFile(
400-
path=result,
401-
url=url,
402-
creator=creators,
403-
dataset=dataset.name, # TODO detect original dataset
404-
)
405-
}
397+
return [{
398+
'path': dst.relative_to(self.path),
399+
'url': url,
400+
'creator': creators,
401+
'dataset': dataset.name,
402+
'parent': self
403+
}]
406404

407405
def get_relative_url(self, url):
408406
"""Determine if the repo url should be relative."""

renku/cli/_checks/files_in_datasets.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
# limitations under the License.
1818
"""Check location of files in datasets."""
1919

20-
import os
2120
from collections import defaultdict
21+
from pathlib import Path
2222

2323
import click
2424

@@ -31,13 +31,11 @@ def check_missing_files(client):
3131

3232
for path, dataset in client.datasets.items():
3333
for file_ in dataset.files:
34-
filepath = (path.parent / file_.path)
34+
filepath = Path(file_.path)
3535
if not filepath.exists():
3636
missing[str(
3737
path.parent.relative_to(client.renku_datasets_path)
38-
)].append(
39-
os.path.normpath(str(filepath.relative_to(client.path)))
40-
)
38+
)].append(str(filepath))
4139

4240
if not missing:
4341
return True

renku/cli/_format/graph.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,11 @@ def dot(graph, simple=True, debug=False, landscape=False):
5959
)
6060

6161
g.bind('prov', 'http://www.w3.org/ns/prov#')
62+
g.bind('foaf', 'http://xmlns.com/foaf/0.1/')
6263
g.bind('wfdesc', 'http://purl.org/wf4ever/wfdesc#')
6364
g.bind('wf', 'http://www.w3.org/2005/01/wf/flow#')
6465
g.bind('wfprov', 'http://purl.org/wf4ever/wfprov#')
66+
g.bind('schema', 'http://schema.org/')
6567

6668
if debug:
6769
rdf2dot(g, sys.stdout)
@@ -230,20 +232,33 @@ def color(p):
230232
"""Choose node color."""
231233
return 'BLACK'
232234

233-
for s, p, o in g:
235+
# filter out nodes and edges created for directories
236+
sparql = """
237+
SELECT ?s ?p ?o
238+
WHERE {
239+
?s ?p ?o
240+
MINUS {
241+
?s rdf:type prov:Collection.
242+
}
243+
MINUS {
244+
VALUES ?exclude { prov:wasInformedBy prov:influenced rdf:label }
245+
?s ?exclude ?o.
246+
}
247+
}
248+
"""
249+
250+
for s, p, o in g.query(sparql):
234251
sn = node(s)
235252
if p == rdflib.RDFS.label:
236253
continue
237-
238254
# inject the type predicate into the node itself
239255
if p == rdflib.RDF.type:
240256
types[sn].add((qname(p, g), cgi.escape(o)))
241257
continue
242-
if p == rdflib.term.URIRef('http://purl.org/dc/terms/isPartOf'):
258+
# add the project membership to the node
259+
if p == rdflib.term.URIRef('schema:isPartOf'):
243260
fields[sn].add((qname(p, g), cgi.escape(o)))
244261
continue
245-
if p == rdflib.term.URIRef('http://www.w3.org/ns/prov#wasInformedBy'):
246-
continue
247262

248263
if isinstance(o, (rdflib.URIRef, rdflib.BNode)):
249264
on = node(o)

renku/cli/_providers/zenodo.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,11 +228,11 @@ def get_files(self):
228228

229229
return [ZenodoFileSerializer(**file_) for file_ in self.files]
230230

231-
def as_dataset(self):
231+
def as_dataset(self, client):
232232
"""Deserialize `ZenodoRecordSerializer` to `Dataset`."""
233233
files = self.get_files()
234234
metadata = self.get_jsonld()
235-
dataset = Dataset.from_jsonld(metadata)
235+
dataset = Dataset.from_jsonld(metadata, client=client)
236236

237237
serialized_files = []
238238
for file_ in files:

renku/cli/dataset.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ def edit(client, id):
246246
)
247247

248248
edited = yaml.safe_load(metadata_edited)
249-
updated_ = Dataset(**edited)
249+
updated_ = Dataset(client=client, **edited)
250250

251251
dataset_.update_metadata(updated_)
252252
dataset_.to_yaml()
@@ -303,7 +303,7 @@ def add_to_dataset(
303303
for file_ in with_metadata.files:
304304
for added_ in dataset.files:
305305

306-
if file_.filename.endswith(added_.path.name):
306+
if added_.path.endswith(file_.filename):
307307
if isinstance(file_.url, ParseResult):
308308
file_.url = file_.url.geturl()
309309

@@ -526,7 +526,7 @@ def import_(ctx, client, uri, name, extract):
526526
try:
527527

528528
record = provider.find_record(uri)
529-
dataset_ = record.as_dataset()
529+
dataset_ = record.as_dataset(client)
530530
files_ = dataset_.files
531531

532532
click.echo(

renku/cli/doctor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ def doctor(ctx, client):
4040
from . import _checks
4141

4242
is_ok = True
43-
for attr in _checks.__all__:
44-
is_ok &= getattr(_checks, attr)(client)
43+
for check in _checks.__all__:
44+
is_ok &= getattr(_checks, check)(client)
4545

4646
if is_ok:
4747
click.secho('Everything seems to be ok.', fg='green')

renku/cli/migrate.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,7 @@ def datasets(ctx, client):
5151
from ._checks.location_datasets import _dataset_metadata_pre_0_3_4
5252

5353
for old_path in _dataset_metadata_pre_0_3_4(client):
54-
with old_path.open('r') as fp:
55-
dataset = Dataset.from_jsonld(yaml.safe_load(fp))
54+
dataset = Dataset.from_yaml(old_path, client=client)
5655

5756
name = str(old_path.parent.relative_to(client.path / 'data'))
5857
new_path = (client.renku_datasets_path / dataset.uid / client.METADATA)

renku/cli/move.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,10 @@ def fmt_dst(path):
8181
renames = {}
8282

8383
for file_ in dataset.files:
84-
filepath = fmt_path(
85-
os.path.normpath(str(path.parent / file_.path))
86-
)
84+
filepath = fmt_path(file_.path)
8785

8886
if filepath in files:
89-
renames[file_.path] = os.path.relpath(
90-
destinations[filepath], start=str(path.parent)
91-
)
87+
renames[file_.path] = destinations[filepath]
9288

9389
if renames:
9490
dataset = dataset.rename_files(

renku/cli/remove.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def fmt_path(path):
6161
remove = []
6262
for file_ in dataset.files:
6363
key = file_.path
64-
filepath = fmt_path(file_.full_path)
64+
filepath = fmt_path(file_.path)
6565
if filepath in files:
6666
remove.append(key)
6767

0 commit comments

Comments (0)