feat: explicit input output specification (#598)

mohammad-alisafaee · jsam · commit ce8ba6737808 · 2019-08-23T08:58:20.000+02:00
* feat: can specify inputs and outputs explicitly

* build: refactor and add tests

* feat: allow working with dirty repo

* review: apply review comments

* refactor: minor refactoring

* undo allow working with dirty repo

* feat: stage all explicit inputs in tool's output directory

* feat: stage ALL file and directory inputs in the tool's output directory.

* chore: fix bad formatting

* review: apply review comments
diff --git a/conftest.py b/conftest.py
@@ -297,3 +297,30 @@ def zenodo_sandbox(client):
         'zenodo', 'access_token',
         'HPwXfABPZ7JNiwXMrktL7pevuuo9jt4gsUCkh3Gs2apg65ixa3JPyFukdGup'
     )
+
+
+@pytest.fixture
+def cli(client, run):
+    """Return a callable Renku CLI.
+
+    It returns the exit code and content of the resulting CWL tool.
+    """
+    import yaml
+    from renku.models.cwl import CWLClass
+
+    def renku_cli(*args):
+        before_cwl_files = set(client.workflow_path.glob('*.cwl'))
+        exit_code = run(args)
+        after_cwl_files = set(client.workflow_path.glob('*.cwl'))
+        new_files = after_cwl_files - before_cwl_files
+        assert len(new_files) <= 1
+        if new_files:
+            cwl_filepath = new_files.pop()
+            with cwl_filepath.open('r') as f:
+                content = CWLClass.from_cwl(yaml.safe_load(f))
+        else:
+            content = None
+
+        return exit_code, content
+
+    return renku_cli
diff --git a/renku/cli/run.py b/renku/cli/run.py
@@ -49,6 +49,16 @@
   as an **output**;
 * a path is not passed as an argument to ``renku run``.
 
+.. topic:: Specifying auxiliary inputs (``--input``)
+
+   You can specify extra inputs to your program explicitly by using the
+   ``--input`` option. This is useful for specifying hidden dependencies
+   that don't appear on the command line. These input files must exist before
+   execution of ``renku run`` command. This option is not a replacement for
+   the arguments that are passed on the command line. Files or directories
+   specified with this option will not be passed as input arguments to the
+   script.
+
 Detecting output paths
 ~~~~~~~~~~~~~~~~~~~~~~
 
@@ -84,6 +94,13 @@
    You can specify the ``--no-output`` option to force tracking of such
    an execution.
 
+.. topic:: Specifying outputs explicitly (``--output``)
+
+   You can specify expected outputs of your program explicitly by using the
+   ``--output`` option. These outputs must exist after the execution of the
+   ``renku run`` command. However, they do not need to be modified by
+   the command.
+
 .. cli-run-std
 
 Detecting standard streams
@@ -134,6 +151,12 @@
 
 
 @click.command(context_settings=dict(ignore_unknown_options=True, ))
+@click.option(
+    'inputs',
+    '--input',
+    multiple=True,
+    help='Force a path to be considered as an input.',
+)
 @click.option(
     'outputs',
     '--output',
@@ -162,12 +185,16 @@
     commit=True,
     ignore_std_streams=True,
 )
-def run(client, outputs, no_output, success_codes, isolation, command_line):
+def run(
+    client, inputs, outputs, no_output, success_codes, isolation, command_line
+):
     """Tracking work on a specific problem."""
     working_dir = client.repo.working_dir
     mapped_std = _mapped_std_streams(client.candidate_paths)
     factory = CommandLineToolFactory(
         command_line=command_line,
+        explicit_inputs=inputs,
+        explicit_outputs=outputs,
         directory=os.getcwd(),
         working_dir=working_dir,
         successCodes=success_codes,
@@ -177,9 +204,7 @@ def run(client, outputs, no_output, success_codes, isolation, command_line):
         }
     )
     with client.with_workflow_storage() as wf:
-        with factory.watch(
-            client, no_output=no_output, outputs=outputs
-        ) as tool:
+        with factory.watch(client, no_output=no_output) as tool:
             # Don't compute paths if storage is disabled.
             if client.has_external_storage:
                 # Make sure all inputs are pulled from a storage.
diff --git a/renku/errors.py b/renku/errors.py
@@ -185,6 +185,7 @@ def __init__(self, repo, unmodified):
                  for path in unmodified) + '\n'
             '\nOnce you have removed the files that should be used as outputs,'
             '\nyou can safely rerun the previous command.'
+            '\nYou can use --output flag to specify outputs explicitly.'
         )
 
 
@@ -214,6 +215,7 @@ def __init__(self, repo, inputs):
                 ) + '\n\n'
                 'Once you have removed files that should be used as outputs,\n'
                 'you can safely rerun the previous command.'
+                '\nYou can use --output flag to specify outputs explicitly.'
             )
         else:
             msg += (
@@ -224,6 +226,10 @@ def __init__(self, repo, inputs):
         super(OutputsNotFound, self).__init__(msg)
 
 
+class InvalidInputPath(RenkuException, click.ClickException):
+    """Raise when input path does not exist or is not in the repository."""
+
+
 class InvalidSuccessCode(RenkuException, click.ClickException):
     """Raise when the exit-code is not 0 or redefined."""
 
diff --git a/renku/models/cwl/command_line_tool.py b/renku/models/cwl/command_line_tool.py
@@ -162,6 +162,15 @@ class CommandLineToolFactory(object):
         if isinstance(cmd, (list, tuple)) else shlex.split(cmd),
     )
 
+    explicit_inputs = attr.ib(
+        default=[],
+        converter=lambda paths: [Path(path).resolve() for path in paths]
+    )
+    explicit_outputs = attr.ib(
+        default=[],
+        converter=lambda paths: [Path(path).resolve() for path in paths]
+    )
+
     directory = attr.ib(
         default='.',
         converter=lambda path: Path(path).resolve(),
@@ -183,7 +192,7 @@ class CommandLineToolFactory(object):
     successCodes = attr.ib(default=attr.Factory(list))  # list(int)
 
     def __attrs_post_init__(self):
-        """Derive basic informations."""
+        """Derive basic information."""
         self.baseCommand, detect = self.split_command_and_args()
         self.arguments = []
         self.inputs = []
@@ -218,6 +227,10 @@ def __attrs_post_init__(self):
             else:
                 self.inputs.append(input_)
 
+        if self.explicit_inputs:
+            for input in self.find_explicit_inputs():
+                self.inputs.append(input)
+
     def generate_tool(self):
         """Return an instance of command line tool."""
         return CommandLineTool(
@@ -232,24 +245,11 @@ def generate_tool(self):
         )
 
     @contextmanager
-    def watch(self, client, no_output=False, outputs=None):
+    def watch(self, client, no_output=False):
         """Watch a Renku repository for changes to detect outputs."""
         tool = self.generate_tool()
         repo = client.repo
 
-        if outputs:
-            directories = [
-                output for output in outputs if Path(output).is_dir()
-            ]
-
-            client.repo.git.rm(
-                *outputs, r=True, force=True, ignore_unmatch=True
-            )
-            client.repo.index.commit('renku: automatic removal of outputs')
-
-            for directory in directories:
-                Path(directory).mkdir(parents=True, exist_ok=True)
-
         # NOTE consider to use git index instead
         existing_directories = {
             str(p.relative_to(client.path))
@@ -261,6 +261,10 @@ def watch(self, client, no_output=False, outputs=None):
         if repo:
             # List of all output paths.
             paths = []
+
+            inputs = {input.id: input for input in self.inputs}
+            outputs = list(tool.outputs)
+
             # Keep track of unmodified output files.
             unmodified = set()
 
@@ -277,9 +281,6 @@ def watch(self, client, no_output=False, outputs=None):
             from renku.cli._graph import _safe_path
             candidates = {path for path in candidates if _safe_path(path)}
 
-            inputs = {input.id: input for input in self.inputs}
-            outputs = list(tool.outputs)
-
             for output, input, path in self.guess_outputs(candidates):
                 outputs.append(output)
                 paths.append(path)
@@ -292,27 +293,48 @@ def watch(self, client, no_output=False, outputs=None):
 
             for stream_name in ('stdout', 'stderr'):
                 stream = getattr(self, stream_name)
-                if stream and stream not in candidates:
+                if (
+                    stream and stream not in candidates and
+                    Path(stream).resolve() not in self.explicit_outputs
+                ):
                     unmodified.add(stream)
                 elif stream:
                     paths.append(stream)
 
+            if self.explicit_outputs:
+                last_output_id = len(outputs)
+
+                for output, input, path in self.find_explicit_outputs(
+                    last_output_id
+                ):
+                    outputs.append(output)
+                    paths.append(path)
+
+                    if input is not None:
+                        if input.id not in inputs:  # pragma: no cover
+                            raise RuntimeError('Inconsistent input name.')
+
+                        inputs[input.id] = input
+
             if unmodified:
                 raise errors.UnmodifiedOutputs(repo, unmodified)
 
             if not no_output and not paths:
                 raise errors.OutputsNotFound(repo, inputs.values())
 
+            if client.has_external_storage:
+                client.track_paths_in_storage(*paths)
+
             tool.inputs = list(inputs.values())
             tool.outputs = outputs
 
-            client.track_paths_in_storage(*paths)
-
         # Requirement detection can be done anytime.
         from .process_requirements import InitialWorkDirRequirement, \
             InlineJavascriptRequirement
         initial_work_dir_requirement = InitialWorkDirRequirement.from_tool(
-            tool, existing_directories=existing_directories
+            tool,
+            existing_directories=existing_directories,
+            working_dir=self.working_dir
         )
         if initial_work_dir_requirement:
             tool.requirements.extend([
@@ -521,25 +543,28 @@ def guess_outputs(self, paths):
                     str(input_path / path)
                     for path in tree.get(input_path, default=[])
                 }
-                content = {
-                    str(path)
-                    for path in input_path.rglob('*')
-                    if not path.is_dir() and path.name != '.gitkeep'
-                }
-                extra_paths = content - subpaths
-                if extra_paths:
-                    raise errors.InvalidOutputPath(
-                        'The output directory "{0}" is not empty. \n\n'
-                        'Delete existing files before running the command:'
-                        '\n  (use "git rm <file>..." to remove them first)'
-                        '\n\n'.format(input_path) + '\n'.join(
-                            '\t' + click.style(path, fg='yellow')
-                            for path in extra_paths
-                        ) + '\n\n'
-                        'Once you have removed files that should be used '
-                        'as outputs,\n'
-                        'you can safely rerun the previous command.'
-                    )
+                if input_path.resolve() not in self.explicit_outputs:
+                    content = {
+                        str(path)
+                        for path in input_path.rglob('*')
+                        if not path.is_dir() and path.name != '.gitkeep'
+                    }
+                    extra_paths = content - subpaths
+                    if extra_paths:
+                        raise errors.InvalidOutputPath(
+                            'The output directory "{0}" is not empty. \n\n'
+                            'Delete existing files before running the '
+                            'command:'
+                            '\n  (use "git rm <file>..." to remove them '
+                            'first)'
+                            '\n\n'.format(input_path) + '\n'.join(
+                                '\t' + click.style(path, fg='yellow')
+                                for path in extra_paths
+                            ) + '\n\n'
+                            'Once you have removed files that should be used '
+                            'as outputs,\n'
+                            'you can safely rerun the previous command.'
+                        )
 
                 # Remove files from the input directory
                 paths = [path for path in paths if path not in subpaths]
@@ -611,3 +636,83 @@ def guess_outputs(self, paths):
                         outputBinding=dict(glob=glob, ),
                     ), None, glob
                 )
+
+    def find_explicit_inputs(self):
+        """Yield explicit inputs and command line input bindings if any."""
+        input_paths = [
+            input.default.path
+            for input in self.inputs if input.type in PATH_OBJECTS
+        ]
+        input_id = len(self.inputs) + len(self.arguments)
+
+        for explicit_input in self.explicit_inputs:
+            if explicit_input in input_paths:
+                continue
+
+            try:
+                explicit_input.relative_to(self.working_dir)
+            except ValueError:
+                raise errors.InvalidInputPath(
+                    'The input file or directory is not in the repository.'
+                    '\n\n\t' + click.style(str(explicit_input), fg='yellow') +
+                    '\n\n'
+                )
+            if self.file_candidate(explicit_input) is None:
+                raise errors.InvalidInputPath(
+                    'The input file or directory does not exist.'
+                    '\n\n\t' + click.style(str(explicit_input), fg='yellow') +
+                    '\n\n'
+                )
+            input_id += 1
+            default, type, _ = self.guess_type(explicit_input)
+            # Explicit inputs are either File or Directory
+            assert type in PATH_OBJECTS
+            # The inputBinding is None because these inputs won't
+            # appear on command-line
+            yield CommandInputParameter(
+                id='input_{0}'.format(input_id),
+                type=type,
+                default=default,
+                inputBinding=None
+            )
+
+    def find_explicit_outputs(self, starting_output_id):
+        """Yield explicit output and changed command input parameter."""
+        inputs = {
+            str(i.default.path.relative_to(self.working_dir)): i
+            for i in self.inputs if i.type in PATH_OBJECTS
+        }
+        output_id = starting_output_id
+
+        for path in self.explicit_outputs:
+            if self.file_candidate(path) is None:
+                raise errors.InvalidOutputPath(
+                    'The output file or directory does not exist.'
+                    '\n\n\t' + click.style(str(path), fg='yellow') + '\n\n'
+                )
+
+            output_path = str(path.relative_to(self.working_dir))
+            type = 'Directory' if path.is_dir() else 'File'
+            if output_path in inputs:
+                # change input type to note that it is also an output
+                input = inputs[output_path]
+                input = attr.evolve(input, type='string', default=output_path)
+                yield (
+                    CommandOutputParameter(
+                        id='output_{0}'.format(output_id),
+                        type=type,
+                        outputBinding=dict(
+                            glob='$(inputs.{0})'.format(input.id)
+                        )
+                    ), input, output_path
+                )
+            else:
+                yield (
+                    CommandOutputParameter(
+                        id='output_{0}'.format(output_id),
+                        type=type,
+                        outputBinding=dict(glob=str(output_path))
+                    ), None, output_path
+                )
+
+            output_id += 1
diff --git a/renku/models/cwl/process_requirements.py b/renku/models/cwl/process_requirements.py
diff --git a/tests/cli/test_output_option.py b/tests/cli/test_output_option.py