diff --git a/renga/cli/_graph.py b/renga/cli/_graph.py
index 128bc339c0..2a1c5a3874 100644
--- a/renga/cli/_graph.py
+++ b/renga/cli/_graph.py
@@ -25,6 +25,7 @@
 from renga._compat import Path
 from renga.models.cwl.command_line_tool import CommandLineTool
+from renga.models.cwl.workflow import Workflow
 
 
 @attr.s
@@ -72,7 +73,8 @@ def find_latest(self, start, path):
     def iter_file_inputs(self, tool, basedir):
         """Yield path of tool file inputs."""
         if tool.stdin:
-            raise NotImplemented(tool.stdin)
+            if tool.stdin[0] != '$':  # pragma: no cover
+                raise NotImplementedError(tool.stdin)
 
         for input_ in tool.inputs:
             if input_.type == 'File' and input_.default:
                 yield os.path.relpath(
@@ -105,9 +107,85 @@ def add_file(self, path, revision='HEAD'):
             file_key = self.add_node(commit, path)
             tool_key = self.add_tool(commit, cwl)
             #: Edge from a tool to the output.
-            self.G.add_edge(tool_key, file_key)
+            tool = self.G.nodes[tool_key]['tool']
+            output_id = tool.get_output_id(path)
+            self.G.add_edge(tool_key, file_key, id=output_id)
             return file_key
 
         if file_commits:
             #: Does not have a parent CWL.
             return self.add_node(file_commits[0], path)
+
+    @property
+    def _output_keys(self):
+        """Return a list of the output keys."""
+        return [n for n, d in self.G.out_degree() if d == 0]
+
+    def _source_name(self, key):
+        """Find source name for a node."""
+        if self.G.in_degree(key) == 0:
+            return None
+
+        assert self.G.in_degree(key) == 1
+
+        tool_key, attr = list(self.G.pred[key].items())[0]
+        step = self.G.nodes[tool_key]['step']['id']
+        return '{0}/{1}'.format(step, attr['id'])
+
+    @property
+    def _tool_nodes(self):
+        """Yield topologically sorted tools."""
+        for key in nx.topological_sort(self.G):
+            node = self.G.nodes[key]
+            tool = node.get('tool')
+            if tool is not None:
+                yield key, node
+
+    def ascwl(self):
+        """Serialize graph to CWL workflow."""
+        workflow = Workflow()
+
+        input_index = 1
+
+        for tool_index, (key, node) in enumerate(self._tool_nodes, 1):
+            _, path = key
+            tool = node['tool']
+            step_id = 'step_{0}'.format(tool_index)
+            node['step'] = {'id': step_id}
+
+            ins = {
+                edge_id: self._source_name(target_id)
+                for target_id, _, edge_id in self.G.in_edges(key, data='id')
+            }
+            outs = [
+                edge_id for _, _, edge_id in self.G.out_edges(key, data='id')
+            ]
+
+            for input_ in tool.inputs:
+                input_mapping = ins.get(input_.id)
+                if input_mapping is None:
+                    input_id = 'input_{0}'.format(input_index)
+                    workflow.inputs.append({
+                        'id': input_id,
+                        'type': input_.type,
+                        # 'default': input_.default,
+                    })
+                    input_index += 1
+                    ins[input_.id] = input_id
+
+            workflow.add_step(
+                run=Path(path),
+                id=step_id,
+                in_=ins,
+                out=outs,
+            )
+
+        for index, key in enumerate(self._output_keys):
+            output_id = 'output_{0}'.format(index)
+            workflow.outputs.append({
+                'id': output_id,
+                'type': 'File',
+                'outputSource': self._source_name(key),
+            })
+
+        return workflow
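The ascwl() method above leans on two graph properties: sink nodes (out-degree
zero) become the workflow outputs, and nx.topological_sort() visits producers
before consumers, so every step's input sources are already named when the
step is emitted. A minimal, self-contained sketch of that idea with plain
networkx — the node and id names below are illustrative, not part of the patch:

    import networkx as nx

    # Hypothetical chain: a tool writes data.csv, a second tool turns it
    # into counted.txt.
    G = nx.DiGraph()
    G.add_edge('step1.cwl', 'data.csv', id='output_1')
    G.add_edge('data.csv', 'step2.cwl', id='input_1')
    G.add_edge('step2.cwl', 'counted.txt', id='output_1')

    # Sinks (out-degree zero) become the workflow outputs.
    assert [n for n, d in G.out_degree() if d == 0] == ['counted.txt']

    # Topological order visits producers before consumers, so by the time
    # step2.cwl is serialized its input source 'step1/output_1' is known.
    order = list(nx.topological_sort(G))
    assert order.index('step1.cwl') < order.index('step2.cwl')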
diff --git a/renga/cli/run.py b/renga/cli/run.py
index 70a370809b..65c811ab9c 100644
--- a/renga/cli/run.py
+++ b/renga/cli/run.py
@@ -18,6 +18,7 @@
 """Track provenance of data created by executing programs."""
 
 import os
+import sys
 from subprocess import call
 
 import click
@@ -39,11 +40,20 @@ def run(repo, no_output, command_line):
     """Tracking work on a specific problem."""
     candidates = [x[0] for x in repo.git.index.entries] + \
         repo.git.untracked_files
+    mapped_std = _mapped_std_streams(candidates)
     factory = CommandLineToolFactory(
         command_line=command_line,
-        **_mapped_std_streams(candidates))
+        **mapped_std)
     with repo.with_workflow_storage() as wf:
         with factory.watch(repo.git,
                            no_output=no_output) as tool:
-            call(factory.command_line, cwd=os.getcwd())
+            call(
+                factory.command_line,
+                cwd=os.getcwd(),
+                **{key: getattr(sys, key) for key in mapped_std.keys()},
+            )
+
+            sys.stdout.flush()
+            sys.stderr.flush()
+
             wf.add_step(run=tool)
diff --git a/renga/cli/workflow.py b/renga/cli/workflow.py
new file mode 100644
index 0000000000..304aeb5118
--- /dev/null
+++ b/renga/cli/workflow.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright 2018 - Swiss Data Science Center (SDSC)
+# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
+# Eidgenössische Technische Hochschule Zürich (ETHZ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Workflow operations."""
+
+import os
+
+import click
+import yaml
+
+from renga.models.cwl._ascwl import ascwl
+
+from ._graph import Graph
+from ._repo import pass_repo
+
+
+@click.group()
+def workflow():
+    """Workflow operations."""
+
+
+@workflow.command()
+@click.option('--revision', default='HEAD')
+@click.argument('path', type=click.Path(exists=True, dir_okay=False), nargs=-1)
+@pass_repo
+def create(repo, revision, path):
+    """Create a workflow description for a file."""
+    graph = Graph(repo)
+    for p in path:
+        graph.add_file(p, revision=revision)
+
+    click.echo(
+        yaml.dump(ascwl(
+            graph.ascwl(),
+            filter=lambda _, x: x is not None,
+            # basedir=repo.workflow_path,
+            basedir='.',
+        ), default_flow_style=False))
diff --git a/renga/models/cwl/_ascwl.py b/renga/models/cwl/_ascwl.py
index dd05a87dd3..e6e47ce85e 100644
--- a/renga/models/cwl/_ascwl.py
+++ b/renga/models/cwl/_ascwl.py
@@ -78,17 +78,18 @@ def convert_value(v):
         return v
 
     for a in attrs:
+        a_name = a.name.rstrip('_')
         v = getattr(inst, a.name)
         if filter is not None and not filter(a, v):
             continue
         if recurse is True:
             if has(v.__class__):
-                rv[a.name] = ascwl(v, recurse=True, filter=filter,
+                rv[a_name] = ascwl(v, recurse=True, filter=filter,
                                    dict_factory=dict_factory, basedir=basedir)
             elif isinstance(v, (tuple, list, set)):
                 cf = v.__class__ if retain_collection_types is True else list
-                rv[a.name] = cf([
+                rv[a_name] = cf([
                     ascwl(i, recurse=True, filter=filter,
                           dict_factory=dict_factory, basedir=basedir)
                     if has(i.__class__) else i
@@ -99,23 +100,23 @@ def convert_value(v):
                     k = a.metadata['jsonldPredicate'].get('mapSubject')
                     if k:
                         vv = dict_factory()
-                        for i in rv[a.name]:
+                        for i in rv[a_name]:
                             kk = i.pop(k)
                             vv[kk] = i
-                        rv[a.name] = vv
+                        rv[a_name] = vv
             elif isinstance(v, dict):
                 df = dict_factory
-                rv[a.name] = df((
+                rv[a_name] = df((
                     ascwl(kk, dict_factory=df, basedir=basedir)
                     if has(kk.__class__) else kk,
                     ascwl(vv, dict_factory=df, basedir=basedir)
                     if has(vv.__class__) else vv)
                     for kk, vv in iteritems(v))
             else:
-                rv[a.name] = convert_value(v)
+                rv[a_name] = convert_value(v)
         else:
-            rv[a.name] = convert_value(v)
+            rv[a_name] = convert_value(v)
 
     if isinstance(inst, CWLClass):
         rv['class'] = inst.__class__.__name__
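The a_name = a.name.rstrip('_') change in _ascwl.py is what lets attribute
names that collide with Python keywords serialize under their CWL spelling,
e.g. WorkflowStep.in_ becoming an ``in`` key. Reduced to a self-contained
sketch with attrs (the Step class is made up for illustration):

    import attr

    @attr.s
    class Step(object):
        in_ = attr.ib(default=None)  # serialized CWL key is ``in``
        out = attr.ib(default=None)

    # Mirrors the renaming performed once per attribute in ascwl().
    names = [a.name.rstrip('_') for a in attr.fields(Step)]
    assert names == ['in', 'out']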
diff --git a/renga/models/cwl/command_line_tool.py b/renga/models/cwl/command_line_tool.py
index 29df38d477..7629e31f6c 100644
--- a/renga/models/cwl/command_line_tool.py
+++ b/renga/models/cwl/command_line_tool.py
@@ -17,6 +17,7 @@
 # limitations under the License.
 """Represent a ``CommandLineTool`` from the Common Workflow Language."""
 
+import fnmatch
 import re
 import shlex
 from contextlib import contextmanager
@@ -48,8 +49,7 @@ class CommandLineTool(Process, CWLClass):
         cmd, (list, tuple)) else shlex.split(cmd),
     )  # list(string, Expression, CommandLineBinding)
 
-    stdin = attr.ib(default=None, converter=attr.converters.optional(Path))
-    # null, str, Expression
+    stdin = attr.ib(default=None)  # null, str, Expression
     stdout = attr.ib(default=None)
     stderr = attr.ib(default=None)
 
@@ -60,6 +60,24 @@ class CommandLineTool(Process, CWLClass):
     temporaryFailCodes = attr.ib(default=attr.Factory(list))  # list(int)
     permanentFailCodes = attr.ib(default=attr.Factory(list))  # list(int)
 
+    def get_output_id(self, path):
+        """Return an id of the matching path from default values."""
+        for output in self.outputs:
+            if output.type in {'stdout', 'stderr'}:
+                stream = getattr(self, output.type)
+                if stream == path:
+                    return output.id
+            elif output.type == 'File':
+                glob = output.outputBinding.glob
+                # TODO better support for Expression
+                if glob.startswith('$(inputs.'):
+                    input_id = glob[len('$(inputs.'):-1]
+                    for input_ in self.inputs:
+                        if input_.id == input_id and input_.default == path:
+                            return output.id
+                elif fnmatch.fnmatch(path, glob):
+                    return output.id
+
 
 @attr.s
 class CommandLineToolFactory(object):
@@ -94,6 +112,13 @@ def __attrs_post_init__(self):
         self.inputs = []
         self.outputs = []
 
+        if self.stdin:
+            input_ = next(self.guess_inputs(self.stdin))
+            assert input_.type == 'File'
+            input_.id = 'input_stdin'
+            self.inputs.append(input_)
+            self.stdin = '$(inputs.{0}.path)'.format(input_.id)
+
         for stream_name in ('stdout', 'stderr'):
             stream = getattr(self, stream_name)
             if stream and self.file_candidate(stream):
@@ -151,7 +176,7 @@ def watch(self, git=None, no_output=False):
                     'Output file was not created or changed.'
                 )
 
-            if not tool.outputs:
+            if not outputs:
                 raise RuntimeError('No output was detected')
 
             tool.inputs = list(inputs.values())
@@ -184,6 +209,10 @@ def split_command_and_args(self):
         cmd = [self.command_line[0]]
         args = list(self.command_line[1:])
 
+        if len(args) < 2:
+            # only guess subcommand for more arguments
+            return cmd, args
+
         while args and re.match(self._RE_SUBCOMMAND, args[0]) \
                 and not self.file_candidate(args[0]):
             cmd.append(args.pop(0))
@@ -210,7 +239,7 @@ def guess_type(self, value):
             # TODO suggest that the file should be imported to the repo
             pass
 
-        if ',' in value:
+        if len(value) > 1 and ',' in value:
             return value.split(','), 'string[]', ','
 
         return value, 'string', None
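get_output_id() resolves an output path back to an output id in one of three
ways: as a redirected stdout/stderr stream, through a ``$(inputs.<id>)``
reference, or via an fnmatch glob. The two non-stream branches, isolated
(paths and ids here are made up for illustration):

    import fnmatch

    # Glob branch: the produced path is matched against outputBinding.glob.
    assert fnmatch.fnmatch('counted.txt', 'counted.txt')
    assert fnmatch.fnmatch('results/output.csv', 'results/*.csv')

    # Expression branch: '$(inputs.<id>)' is unwrapped to the input id,
    # which is then compared against each input's default value.
    glob = '$(inputs.output_file)'
    assert glob[len('$(inputs.'):-1] == 'output_file'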
"""Represent workflows from the Common Workflow Language.""" +import uuid + import attr +from ._ascwl import CWLClass, mapped +from .process import Process + @attr.s class WorkflowStep(object): """Define an executable element of a workflow.""" run = attr.ib() # string, Process + id = attr.ib(default=attr.Factory(uuid.uuid4)) + + in_ = attr.ib(default=None) + out = attr.ib(default=None) @attr.s -class Workflow(object): +class Workflow(Process, CWLClass): """Define a workflow representation.""" - steps = attr.ib(default=attr.Factory(list)) + steps = mapped(WorkflowStep) def add_step(self, **kwargs): """Add a workflow step.""" diff --git a/setup.py b/setup.py index 4d3621d852..61385968d0 100644 --- a/setup.py +++ b/setup.py @@ -107,6 +107,7 @@ 'log=renga.cli.log:log', 'run=renga.cli.run:run', 'workon=renga.cli.workon:workon', + 'workflow=renga.cli.workflow:workflow', ], }, extras_require=extras_require, diff --git a/tests/conftest.py b/tests/conftest.py index 8d7cdf9d20..68945458d6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -105,21 +105,6 @@ def request_callback(request): yield rsps -@pytest.fixture() -def runner(base_runner): - """Return runner with a new project.""" - from renga import cli - - runner = base_runner - - os.mkdir('test-project') - os.chdir('test-project') - - result = runner.invoke(cli.cli, ['init']) - assert result.exit_code == 0 - yield base_runner - - @pytest.fixture() def graph_mutation_responses(auth_responses, graph_mutation_client): """Monkeypatch requests to immitate the KnowledgeGraph.""" @@ -652,31 +637,37 @@ def add_client(doctest_namespace, renga_client, storage_responses, @pytest.fixture() -def test_file(tmpdir): +def data_file(tmpdir): """Create a sample data file.""" - p = tmpdir.mkdir('data').join('test_file') + p = tmpdir.mkdir('data').join('file') p.write('1234') return p @pytest.fixture() -def test_project(base_runner): +def project(base_runner): """Create a test project.""" from renga import cli - os.makedirs('test-project/data') - os.chdir('test-project') + with base_runner.isolated_filesystem() as project_path: + os.makedirs('data') + result = base_runner.invoke(cli.cli, ['init', '.']) + yield project_path + - result = base_runner.invoke(cli.cli, ['init', '.']) +@pytest.fixture() +def runner(base_runner, project): + """Return runner with a new project.""" + yield base_runner @pytest.fixture() -def test_dataset(test_project): +def dataset(project): """Create a dataset.""" from renga.models import dataset return dataset.Dataset.create( 'dataset', - datadir='./data', + datadir='data', authors={'name': 'me', 'email': 'me@example.com'}) @@ -691,20 +682,20 @@ def request_callback(request): rsps.add_callback( responses.GET, - 'http://example.com/test_file', + 'http://example.com/file', callback=request_callback) rsps.add_callback( responses.GET, - 'https://example.com/test_file', + 'https://example.com/file', callback=request_callback) yield rsps @pytest.fixture() -def test_dir(tmpdir): +def directory_tree(tmpdir): """Create a test directory tree.""" # initialize - p = tmpdir.mkdir('test_dir') + p = tmpdir.mkdir('directory_tree') p.join('file').write('1234') p.join('dir2').mkdir() p.join('dir2/file2').write('5678') @@ -712,23 +703,23 @@ def test_dir(tmpdir): @pytest.fixture() -def test_repo(test_dir): +def data_repository(directory_tree): """Create a test repo.""" from git import Repo, Actor # initialize - repo = Repo.init(test_dir.strpath) + repo = Repo.init(directory_tree.strpath) # add a file - repo.index.add([test_dir.join('file').strpath]) + 
diff --git a/tests/conftest.py b/tests/conftest.py
index 8d7cdf9d20..68945458d6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -105,21 +105,6 @@ def request_callback(request):
     yield rsps
 
 
-@pytest.fixture()
-def runner(base_runner):
-    """Return runner with a new project."""
-    from renga import cli
-
-    runner = base_runner
-
-    os.mkdir('test-project')
-    os.chdir('test-project')
-
-    result = runner.invoke(cli.cli, ['init'])
-    assert result.exit_code == 0
-    yield base_runner
-
-
 @pytest.fixture()
 def graph_mutation_responses(auth_responses, graph_mutation_client):
     """Monkeypatch requests to immitate the KnowledgeGraph."""
@@ -652,31 +637,37 @@ def add_client(doctest_namespace, renga_client, storage_responses,
 
 
 @pytest.fixture()
-def test_file(tmpdir):
+def data_file(tmpdir):
     """Create a sample data file."""
-    p = tmpdir.mkdir('data').join('test_file')
+    p = tmpdir.mkdir('data').join('file')
     p.write('1234')
     return p
 
 
 @pytest.fixture()
-def test_project(base_runner):
+def project(base_runner):
     """Create a test project."""
     from renga import cli
 
-    os.makedirs('test-project/data')
-    os.chdir('test-project')
+    with base_runner.isolated_filesystem() as project_path:
+        os.makedirs('data')
+        result = base_runner.invoke(cli.cli, ['init', '.'])
+        yield project_path
+
 
-    result = base_runner.invoke(cli.cli, ['init', '.'])
+@pytest.fixture()
+def runner(base_runner, project):
+    """Return runner with a new project."""
+    yield base_runner
 
 
 @pytest.fixture()
-def test_dataset(test_project):
+def dataset(project):
     """Create a dataset."""
     from renga.models import dataset
 
     return dataset.Dataset.create(
         'dataset',
-        datadir='./data',
+        datadir='data',
         authors={'name': 'me', 'email': 'me@example.com'})
@@ -691,20 +682,20 @@ def request_callback(request):
 
     rsps.add_callback(
         responses.GET,
-        'http://example.com/test_file',
+        'http://example.com/file',
         callback=request_callback)
     rsps.add_callback(
         responses.GET,
-        'https://example.com/test_file',
+        'https://example.com/file',
        callback=request_callback)
     yield rsps
 
 
 @pytest.fixture()
-def test_dir(tmpdir):
+def directory_tree(tmpdir):
     """Create a test directory tree."""
     # initialize
-    p = tmpdir.mkdir('test_dir')
+    p = tmpdir.mkdir('directory_tree')
     p.join('file').write('1234')
     p.join('dir2').mkdir()
     p.join('dir2/file2').write('5678')
@@ -712,23 +703,23 @@
 
 @pytest.fixture()
-def test_repo(test_dir):
+def data_repository(directory_tree):
     """Create a test repo."""
     from git import Repo, Actor
 
     # initialize
-    repo = Repo.init(test_dir.strpath)
+    repo = Repo.init(directory_tree.strpath)
 
     # add a file
-    repo.index.add([test_dir.join('file').strpath])
+    repo.index.add([directory_tree.join('file').strpath])
     repo.index.commit('test commit', author=Actor('me', 'me@example.com'))
 
     # commit changes to the same file with a different user
-    test_dir.join('file').write('5678')
-    repo.index.add([test_dir.join('file').strpath])
+    directory_tree.join('file').write('5678')
+    repo.index.add([directory_tree.join('file').strpath])
     repo.index.commit('test commit', author=Actor('me2', 'me2@example.com'))
 
     # commit a second file
-    repo.index.add([test_dir.join('dir2/file2').strpath])
+    repo.index.add([directory_tree.join('dir2/file2').strpath])
     repo.index.commit('test commit', author=Actor('me', 'me@example.com'))
 
     # return the repo
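The rewritten project fixture builds on Click's isolated_filesystem() instead
of a hand-made test-project directory, so each test runs in a throwaway
working directory; the runner fixture then simply reuses it. The core pattern,
stripped of the renga-specific calls:

    import os

    from click.testing import CliRunner

    runner = CliRunner()
    with runner.isolated_filesystem() as path:
        # The CWD is a fresh temporary directory for the whole block.
        assert os.path.realpath(path) == os.path.realpath(os.getcwd())
        os.makedirs('data')

    # ... and it is cleaned up again on exit.
    assert not os.path.exists(path)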
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 3aaef34417..bf00362dd7 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright 2017 - Swiss Data Science Center (SDSC)
+# Copyright 2017, 2018 - Swiss Data Science Center (SDSC)
 # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
 # Eidgenössische Technische Hochschule Zürich (ETHZ).
 #
@@ -19,8 +19,11 @@
 
 from __future__ import absolute_import, print_function
 
+import contextlib
 import os
+import sys
 
+import git
 import pytest
 import responses
 
@@ -100,10 +103,59 @@ def test_run_simple(runner):
     assert result.exit_code == 0
 
 
-def test_datasets(base_runner, test_file, test_project, test_repo):
-    """Test importing data into a dataset."""
-    runner = base_runner
+def test_workflow(runner):
+    """Test workflow command."""
+    result = runner.invoke(cli.cli, ['run', 'touch', 'data.csv'])
+    assert result.exit_code == 0
+
+    with open('counted.txt', 'w') as stdout:
+        with contextlib.redirect_stdout(stdout):
+            try:
+                cli.cli.main(
+                    args=('run', 'wc', 'data.csv'),
+                    prog_name=runner.get_default_prog_name(cli.cli),
+                )
+            except SystemExit as e:
+                assert e.code in {None, 0}
+
+    result = runner.invoke(cli.cli, ['workflow', 'create', 'counted.txt'])
+    assert result.exit_code == 0
+
+
+def test_streams(runner, capsys):
+    """Test redirection of std streams."""
+    repo = git.Repo('.')
+
+    with open('source.txt', 'w') as source:
+        source.write('first,second,third')
+    repo.git.add('--all')
+    repo.index.commit('Added source.txt')
+
+    with capsys.disabled():
+        with open('source.txt', 'rb') as stdin:
+            with open('result.txt', 'wb') as stdout:
+                try:
+                    old_stdin, old_stdout = sys.stdin, sys.stdout
+                    sys.stdin, sys.stdout = stdin, stdout
+                    try:
+                        cli.cli.main(
+                            args=('run', 'cut', '-d,', '-f', '2', '-s'),
+                        )
+                    except SystemExit as e:
+                        assert e.code in {None, 0}
+                finally:
+                    sys.stdin, sys.stdout = old_stdin, old_stdout
+
+    with open('result.txt', 'r') as f:
+        assert f.read().strip() == 'second'
+
+    result = runner.invoke(cli.cli, ['workflow', 'create', 'result.txt'])
+    assert result.exit_code == 0
+
 
+def test_datasets(data_file, data_repository, runner):
+    """Test importing data into a dataset."""
 
     # create a dataset
     result = runner.invoke(cli.cli, ['datasets', 'create', 'dataset'])
     assert result.exit_code == 0
@@ -112,8 +164,10 @@ def test_datasets(base_runner, test_file, test_project, test_repo):
 
     # add data
     result = runner.invoke(cli.cli, ['datasets', 'add', 'dataset',
-                                     str(test_file)])
-    assert os.stat('data/dataset/test_file')
+                                     str(data_file)])
+    assert os.stat(os.path.join(
+        'data', 'dataset', os.path.basename(data_file)
+    ))
 
     # add data from a git repo via http
     result = runner.invoke(cli.cli, [
@@ -126,4 +180,4 @@ def test_datasets(base_runner, test_file, test_project, test_repo):
     # add data from local git repo
     result = runner.invoke(cli.cli, [
         'datasets', 'add', 'dataset', '-t', 'file', '-t', 'file2',
-        os.path.dirname(test_repo.git_dir)])
+        os.path.dirname(data_repository.git_dir)])
diff --git a/tests/test_cwl.py b/tests/test_cwl.py
index d6ce9d640e..3b24213ee4 100644
--- a/tests/test_cwl.py
+++ b/tests/test_cwl.py
@@ -76,6 +76,14 @@ def test_base_command_detection(instance_path):
     assert tool.inputs[0].inputBinding.separate is True
 
 
+def test_short_base_command_detection():
+    """Test base command detection without arguments."""
+    tool = CommandLineToolFactory(('echo', 'A')).generate_tool()
+    assert tool.cwlVersion == 'v1.0'
+    assert tool.__class__.__name__ == 'CommandLineTool'
+    assert tool.inputs[0].default == 'A'
+
+
 def test_04_output(instance_path):
     """Test describtion of outputs from a command."""
     hello = Path(instance_path) / 'hello.tar'
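test_streams checks that ``renga run`` now forwards redirected standard
streams to the executed command. The subprocess mechanics it ultimately
exercises are plain stdin/stdout keyword arguments to call(), roughly as
below (a sketch that assumes a POSIX ``cut`` on the PATH):

    from subprocess import call

    with open('source.txt', 'w') as f:
        f.write('first,second,third\n')

    # run.py passes the detected stream mapping straight through to
    # subprocess.call(); with redirected files this boils down to:
    with open('source.txt', 'rb') as stdin, open('result.txt', 'wb') as stdout:
        call(['cut', '-d,', '-f', '2'], stdin=stdin, stdout=stdout)

    with open('result.txt') as f:
        assert f.read().strip() == 'second'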
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index f8ea719593..d7306ee6a2 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -46,7 +46,7 @@ def not_raises():
     return not_raises()
 
 
-def test_dataset_creation(test_project):
+def dataset_creation(project):
     """Test dataset directory tree creation."""
     # creating a dataset without an author fails
     with pytest.raises(RuntimeError):
@@ -67,31 +67,31 @@
 @pytest.mark.parametrize('scheme, path, error',
                          [('', 'temp', None), ('file://', 'temp', None),
                           ('', 'tempp', git.NoSuchPathError),
-                          ('http://', 'example.com/test_file',
-                           None), ('https://', 'example.com/test_file',
+                          ('http://', 'example.com/file',
+                           None), ('https://', 'example.com/file',
                            None), ('bla://', 'file', NotImplementedError)])
-def test_data_add(scheme, path, error, test_project, test_file, test_dir,
+def test_data_add(scheme, path, error, project, data_file, directory_tree,
                   dataset_responses):
     """Test data import."""
     with raises(error):
         if path == 'temp':
-            path = str(test_file)
+            path = str(data_file)
         elif path == 'tempdir':
-            path = str(test_dir)
+            path = str(directory_tree)
 
         d = dataset.Dataset.create(
             'dataset',
             datadir='./data',
             authors={'name': 'me', 'email': 'me@example.com'})
         d.add_data('{}{}'.format(scheme, path))
-        with open('data/dataset/test_file') as f:
+        with open('data/dataset/file') as f:
             assert f.read() == '1234'
 
-        assert d.files.get('test_file')
+        assert d.files.get('file')
 
         # check that the imported file is read-only
-        assert not os.access('data/dataset/test_file',
+        assert not os.access('data/dataset/file',
                              stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
 
         assert os.stat('data/dataset/metadata.json')
@@ -104,68 +104,68 @@
             authors={'name': 'me', 'email': 'me@example.com'})
         d.add_data('{}{}'.format(scheme, path), nocopy=True)
 
-        assert os.path.exists('data/dataset/test_file')
+        assert os.path.exists('data/dataset/file')
 
 
-def test_data_add_recursive(test_dir, test_project):
+def test_data_add_recursive(directory_tree, project):
     """Test recursive data imports."""
     d = dataset.Dataset.create(
         'dataset', authors={'name': 'me', 'email': 'me@example.com'})
-    d.add_data(test_dir.join('dir2').strpath)
+    d.add_data(directory_tree.join('dir2').strpath)
 
     assert 'dir2/file2' in d.files
 
 
-def test_dataset_serialization(test_dataset, test_file):
+def dataset_serialization(dataset, data_file):
    """Test deserializing a dataset object."""
     # deserialize from json on disk
-    d = dataset.Dataset.from_json(test_dataset.path.joinpath('metadata.json'))
-    assert d.path == test_dataset.path
+    d = dataset.Dataset.from_json(dataset.path.joinpath('metadata.json'))
+    assert d.path == dataset.path
 
     d_dict = d.to_dict()
 
     assert all([key in d_dict for key in ('name', 'identifier', 'files')])
     assert not len(d_dict['files'].values())
-    d.add_data(str(test_file))
+    d.add_data(str(data_file))
     d_dict = d.to_dict()
     assert len(d_dict['files'].values())
 
 
-def test_repo_commit(test_dataset, test_file):
+def data_repository_commit(dataset, data_file):
     """Test that files get commited to the git repository properly."""
     from git import Repo
     r = Repo('.')
 
-    test_dataset.repo = r
-    test_dataset.add_data(str(test_file))
-    test_dataset.write_metadata()
-    test_dataset.commit_to_repo()
+    dataset.repo = r
+    dataset.add_data(str(data_file))
+    dataset.write_metadata()
+    dataset.commit_to_repo()
 
     assert all([
         f not in r.untracked_files
-        for f in ['data/dataset/metadata.json', 'data/dataset/test_file']
+        for f in ['data/dataset/metadata.json', 'data/dataset/file']
     ])
 
 
-def test_git_repo_import(test_dataset, tmpdir, test_repo):
+def test_git_repo_import(dataset, tmpdir, data_repository):
     """Test an import from a git repository."""
     from git import Repo
     r = Repo('.')
-    test_dataset.repo = r
+    dataset.repo = r
 
     # add data from local repo
-    test_dataset.add_data(
-        os.path.join(os.path.dirname(test_repo.git_dir), 'dir2'))
-    assert os.stat('data/dataset/test_dir/dir2/file2')
-    assert 'test_dir/dir2/file2' in test_dataset.files
+    dataset.add_data(
+        os.path.join(os.path.dirname(data_repository.git_dir), 'dir2'))
+    assert os.stat('data/dataset/directory_tree/dir2/file2')
+    assert 'directory_tree/dir2/file2' in dataset.files
     assert os.stat('.renga/vendors/local')
 
     # check that the authors are properly parsed from commits
-    test_dataset.add_data(os.path.dirname(test_repo.git_dir), target='file')
-    assert len(test_dataset.files['test_dir/file'].authors) == 2
+    dataset.add_data(os.path.dirname(data_repository.git_dir), target='file')
+    assert len(dataset.files['directory_tree/file'].authors) == 2
     assert all(
         x.name in ('me', 'me2')
-        for x in test_dataset.files['test_dir/file'].authors)
+        for x in dataset.files['directory_tree/file'].authors)
 
 
 @pytest.mark.parametrize('authors', [
@@ -176,10 +176,10 @@
         'email': 'me@example.com'
     }
 ])
-def test_author_parse(authors, test_file):
+def test_author_parse(authors, data_file):
     """Test that different options for specifying authors work."""
     f = dataset.DatasetFile(
-        'test_file', origin=str(test_file), authors=authors)
+        'file', origin=str(data_file), authors=authors)
     assert dataset.Author(name='me', email='me@example.com') in f.authors
 
     # email check
@@ -189,4 +189,4 @@ def test_author_parse(authors, test_file):
     # authors must be a set or list of dicts or Author
     with pytest.raises(ValueError):
         f = dataset.DatasetFile(
-            'test_file', origin=str(test_file), authors=['name'])
+            'file', origin=str(data_file), authors=['name'])
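Taken together, the pieces allow reconstructing a CWL workflow from git
history. A rough end-to-end sketch using Click's test runner — it assumes a
checkout with these changes installed and git configured, so treat it as an
outline rather than a guaranteed recipe:

    from click.testing import CliRunner

    from renga import cli

    runner = CliRunner()
    with runner.isolated_filesystem():
        assert runner.invoke(cli.cli, ['init', '.']).exit_code == 0
        assert runner.invoke(
            cli.cli, ['run', 'touch', 'data.csv']).exit_code == 0

        # Reconstruct a CWL description of how data.csv came to be.
        result = runner.invoke(cli.cli, ['workflow', 'create', 'data.csv'])
        assert result.exit_code == 0
        # The YAML dump is expected to carry the CWL class marker.
        assert 'class: Workflow' in result.output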