Skip to content

Commit ccc4f59

Browse files
committed
feat(datasets): added ls-files command
1 parent 5d1e8e7 commit ccc4f59

File tree

5 files changed

+391
-15
lines changed

5 files changed

+391
-15
lines changed

renku/cli/_format/dataset_files.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright 2019 - Swiss Data Science Center (SDSC)
4+
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
5+
# Eidgenössische Technische Hochschule Zürich (ETHZ).
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
"""Serializers for dataset list files."""
19+
20+
from collections import OrderedDict
21+
22+
from renku.cli._echo import echo_via_pager
23+
24+
25+
def tabular(client, records):
    """Render dataset files as a table and display it through the pager.

    :param client: LocalClient instance (not used by this formatter).
    :param records: Filtered collection.
    """
    from renku.models._tabulate import tabulate

    # Ordered mapping of record attribute -> column title. A ``None`` title
    # presumably means "use the attribute name" -- verify against
    # ``tabulate``.
    columns = OrderedDict()
    columns['added'] = None
    columns['authors_csv'] = 'authors'
    columns['dataset'] = None
    columns['full_path'] = 'path'

    table = tabulate(records, headers=columns)
    echo_via_pager(table)
44+
45+
46+
def jsonld(client, records):
    """Serialize dataset files to JSON-LD and display it through the pager.

    :param client: LocalClient instance (not used by this formatter).
    :param records: Filtered collection.
    """
    from renku.models._json import dumps
    from renku.models._jsonld import asjsonld

    # One JSON-LD document per record, emitted as a single JSON array.
    documents = list(map(asjsonld, records))
    serialized = dumps(documents, indent=2)
    echo_via_pager(serialized)
57+
58+
59+
# Maps each output format name to the function that renders it.
# Insertion order is kept as-is: 'tabular' first, then 'json-ld'.
FORMATS = dict((
    ('tabular', tabular),
    ('json-ld', jsonld),
))
"""Valid formatting options."""

renku/cli/dataset.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383

8484
from ._client import pass_local_client
8585
from ._echo import progressbar
86+
from ._format.dataset_files import FORMATS as DATASET_FILES_FORMATS
8687
from ._format.datasets import FORMATS as DATASETS_FORMATS
8788

8889

@@ -157,6 +158,107 @@ def add(client, name, urls, link, relative_to, target, force):
157158
raise BadParameter('Could not process {0}'.format(url))
158159

159160

161+
@dataset.command('ls-files')
@click.option(
    '--dataset', multiple=True, help='Filter files in specific dataset.'
)
@click.option(
    '--authors',
    help='Filter files which were authored by specific authors. '
    'Multiple authors are specified by comma.'
)
@click.option(
    '-I',
    '--include',
    default=None,
    multiple=True,
    help='Include files matching given pattern.'
)
@click.option(
    '-X',
    '--exclude',
    default=None,
    multiple=True,
    help='Exclude files matching given pattern.'
)
@click.option(
    '--format',
    # Pass the format names explicitly instead of relying on dict iteration.
    type=click.Choice(list(DATASET_FILES_FORMATS)),
    default='tabular',
    help='Choose an output format.'
)
@pass_local_client(clean=False, commit=False)
def ls_files(client, format, exclude, include, authors, dataset):
    """List files in dataset."""
    # NOTE: the docstring above is what click prints as the command help,
    # so parameter documentation lives in these comments instead.
    #
    # client  -- client object supplied by ``pass_local_client``.
    # format  -- key of ``DATASET_FILES_FORMATS`` selecting the renderer.
    # exclude -- tuple of patterns; matching files are dropped.
    # include -- tuple of patterns; when given, only matching files are kept.
    # authors -- comma-separated author names, or ``None`` for no filter.
    # dataset -- tuple of dataset names; empty means all datasets.
    records = _filter(
        client,
        dataset_names=dataset,
        authors=authors,
        include=include,
        exclude=exclude
    )

    # Dispatch to the formatter chosen on the command line.
    DATASET_FILES_FORMATS[format](client, records)
202+
203+
204+
def _include_exclude(file_path, include=None, exclude=None):
    """Decide whether a file passes the include/exclude pattern filters.

    Exclusion takes precedence: a path matching any ``exclude`` pattern is
    rejected even if it also matches an ``include`` pattern. When ``include``
    patterns are given, the path must match at least one of them; when none
    are given, every non-excluded path is accepted.

    :param file_path: Object exposing ``match(pattern)``, e.g. a
        ``pathlib.PurePath``.
    :param include: Iterable of patterns to keep in the result; a single
        pattern string is also accepted. ``None`` or empty disables it.
    :param exclude: Iterable of patterns to drop from the result; a single
        pattern string is also accepted. ``None`` or empty disables it.
    :returns: ``True`` if the file should be listed, ``False`` otherwise.
    """
    # A bare string would otherwise be iterated character by character,
    # turning one pattern into many single-character patterns.
    if isinstance(include, str):
        include = (include, )
    if isinstance(exclude, str):
        exclude = (exclude, )

    if exclude and any(file_path.match(pattern) for pattern in exclude):
        return False

    if include:
        return any(file_path.match(pattern) for pattern in include)

    return True
223+
224+
225+
def _filter(
    client, dataset_names=None, authors=None, include=None, exclude=None
):
    """Filter dataset files by specified filters.

    As a side effect, every visited file gets its ``dataset`` attribute set
    to the name of the dataset it was found in.

    :param client: Object exposing ``datasets`` (mapping whose values are
        datasets) and ``path`` (base used to relativize file paths).
    :param dataset_names: Filter by specified dataset names. ``None`` or an
        empty collection selects every dataset; a single string is treated
        as one name.
    :param authors: Filter by authors: a comma-separated string or an
        iterable of names. A file matches only if *all* given names are
        among its authors. ``None`` or empty disables the filter.
    :param include: Include files matching file pattern.
    :param exclude: Exclude files matching file pattern.
    :returns: List of matching files sorted by their ``added`` attribute.
    """
    if isinstance(dataset_names, str):
        # Avoid substring matching (``'a' in 'abc'``) for a bare name.
        dataset_names = (dataset_names, )

    if isinstance(authors, str):
        # Tolerate spaces around commas ("a, b") and stray commas ("a,").
        authors = {name.strip() for name in authors.split(',')}
        authors.discard('')
    elif authors is not None:
        authors = set(authors)

    records = []
    for dataset in client.datasets.values():
        # Check emptiness first so ``None`` never reaches the ``in``
        # operator, which would raise ``TypeError``.
        if dataset_names and dataset.name not in dataset_names:
            continue

        for file_ in dataset.files.values():
            # Tag the record with its dataset; the tabular formatter shows
            # this attribute as a column.
            file_.dataset = dataset.name

            relative_path = file_.full_path.relative_to(client.path)
            if not _include_exclude(relative_path, include, exclude):
                continue

            if authors and not authors.issubset({
                author.name
                for author in file_.authors
            }):
                continue

            records.append(file_)

    return sorted(records, key=lambda file_: file_.added)
260+
261+
160262
def get_datadir():
161263
"""Fetch the current data directory."""
162264
ctx = click.get_current_context()

renku/models/datasets.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import configparser
2121
import datetime
22+
import os
2223
import re
2324
import uuid
2425
from functools import partial
@@ -100,27 +101,45 @@ def from_commit(cls, commit):
100101
)
101102

102103

104+
@attr.s
class AuthorsMixin:
    """Provide an ``authors`` container and a CSV rendering of it."""

    # Keyword-only container of ``Author`` items.
    authors = jsonld.container.list(Author, kw_only=True)

    @property
    def authors_csv(self):
        """Return the author names joined by commas, without spaces."""
        names = [author.name for author in self.authors]
        return ",".join(names)
114+
115+
103116
@jsonld.s(
    type='http://schema.org/DigitalDocument',
    slots=True,
)
class DatasetFile(AuthorsMixin):
    """Represent a file in a dataset."""

    # All attributes are keyword-only. Declaration order is kept as-is
    # because attribute definitions are order-sensitive.
    path = _path_attr(kw_only=True)
    url = jsonld.ib(
        default=None,
        context='http://schema.org/url',
        kw_only=True,
    )
    authors = jsonld.container.list(Author, kw_only=True)
    # Plain (non JSON-LD) attribute holding the owning dataset, if set.
    dataset = attr.ib(default=None, kw_only=True)
    added = jsonld.ib(
        context='http://schema.org/dateCreated',
        kw_only=True,
    )

    @added.default
    def _now(self):
        """Return the current UTC time as the default for ``added``."""
        return datetime.datetime.utcnow()

    @property
    def full_path(self):
        """Return full path in the current reference frame.

        ``path`` is joined onto the parent of ``__reference__`` and the
        result is passed through ``os.path.realpath``.
        """
        location = self.__reference__.parent / self.path
        resolved = os.path.realpath(str(location))
        return Path(resolved)
142+
124143

125144
def _parse_date(value):
126145
"""Convert date to datetime."""
@@ -148,7 +167,7 @@ def _convert_dataset_files(value):
148167
'scoro': 'http://purl.org/spar/scoro/',
149168
},
150169
)
151-
class Dataset(object):
170+
class Dataset(AuthorsMixin):
152171
"""Repesent a dataset."""
153172

154173
SUPPORTED_SCHEMES = ('', 'file', 'http', 'https', 'git+https', 'git+ssh')

0 commit comments

Comments
 (0)