Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use the same logic for handling --mets/--mets-basename/--working-dir in processors and "ocrd workspace" CLI #696

Merged
merged 9 commits into from Feb 1, 2022
66 changes: 24 additions & 42 deletions ocrd/ocrd/cli/workspace.py
Expand Up @@ -7,7 +7,7 @@
"""
import os
from os import getcwd
from os.path import relpath, exists, join, isabs, dirname, basename, abspath
from os.path import relpath, exists, join, isabs
from pathlib import Path
from json import loads
import sys
Expand All @@ -27,28 +27,10 @@ class WorkspaceCtx():

def __init__(self, directory, mets_url, mets_basename, automatic_backup):
self.log = getLogger('ocrd.cli.workspace')
if mets_basename and mets_url:
raise ValueError("Use either --mets or --mets-basename, not both")
if mets_basename and not mets_url:
self.log.warning(DeprecationWarning("--mets-basename is deprecated. Use --mets/--directory instead"))
mets_basename = mets_basename if mets_basename else 'mets.xml'
if directory and mets_url:
directory = abspath(directory)
if not abspath(mets_url).startswith(directory):
raise ValueError("--mets has a directory part inconsistent with --directory")
elif not directory and mets_url:
if mets_url.startswith('http') or mets_url.startswith('https:'):
raise ValueError("--mets is an http(s) URL but no --directory was given")
directory = dirname(abspath(mets_url)) or getcwd()
elif directory and not mets_url:
directory = abspath(directory)
mets_url = join(directory, mets_basename)
else:
directory = getcwd()
mets_url = join(directory, mets_basename)
self.directory = directory
self.resolver = Resolver()
self.mets_url = mets_url
if mets_basename:
self.log.warning(DeprecationWarning('--mets-basename is deprecated. Use --mets/--directory instead.'))
self.directory, self.mets_url, self.mets_basename = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename)
self.automatic_backup = automatic_backup

pass_workspace = click.make_pass_decorator(WorkspaceCtx)
Expand Down Expand Up @@ -137,8 +119,8 @@ def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir):

workspace = ctx.resolver.workspace_from_url(
mets_url,
dst_dir=os.path.abspath(ctx.directory),
mets_basename=basename(ctx.mets_url),
dst_dir=ctx.directory,
mets_basename=ctx.mets_basename,
clobber_mets=clobber_mets,
download=download,
)
Expand All @@ -165,8 +147,8 @@ def workspace_init(ctx, clobber_mets, directory):
LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
ctx.directory = directory
workspace = ctx.resolver.workspace_from_nothing(
directory=os.path.abspath(ctx.directory),
mets_basename=basename(ctx.mets_url),
directory=ctx.directory,
mets_basename=ctx.mets_basename,
clobber_mets=clobber_mets
)
workspace.save_mets()
Expand All @@ -191,7 +173,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
Add a file or http(s) URL FNAME to METS in a workspace.
If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)

log = getLogger('ocrd.cli.workspace.add')
if not mimetype:
Expand Down Expand Up @@ -268,7 +250,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp

"""
log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)

try:
pat = re.compile(regex)
Expand Down Expand Up @@ -359,7 +341,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down
"""
modified_mets = False
ret = list()
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
for f in workspace.mets.find_files(
ID=file_id,
fileGrp=file_grp,
Expand Down Expand Up @@ -400,7 +382,7 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin
(If any ``ID`` starts with ``//``, then its remainder
will be interpreted as a regular expression.)
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
for i in id:
workspace.remove_file(i, force=force, keep_file=keep_file)
workspace.save_mets()
Expand All @@ -418,7 +400,7 @@ def rename_group(ctx, old, new):
"""
Rename fileGrp (USE attribute ``NEW`` to ``OLD``).
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
workspace.rename_file_group(old, new)
workspace.save_mets()

Expand All @@ -439,7 +421,7 @@ def remove_group(ctx, group, recursive, force, keep_files):
(If any ``GROUP`` starts with ``//``, then its remainder
will be interpreted as a regular expression.)
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
for g in group:
workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files)
workspace.save_mets()
Expand All @@ -461,7 +443,7 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
(If any ``FILTER`` starts with ``//``, then its remainder
will be interpreted as a regular expression.)
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
with pushd_popd(workspace.directory):
for f in workspace.mets.find_files(
ID=file_id,
Expand All @@ -487,7 +469,7 @@ def list_groups(ctx):
"""
List fileGrp USE attributes
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
print("\n".join(workspace.mets.file_groups))

# ----------------------------------------------------------------------
Expand All @@ -500,7 +482,7 @@ def list_pages(ctx):
"""
List physical page IDs
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
print("\n".join(workspace.mets.physical_pages))

# ----------------------------------------------------------------------
Expand All @@ -513,7 +495,7 @@ def get_id(ctx):
"""
Get METS id if any
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
ID = workspace.mets.unique_identifier
if ID:
print(ID)
Expand All @@ -533,7 +515,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin

Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
workspace.mets.unique_identifier = id
workspace.save_mets()

Expand All @@ -558,7 +540,7 @@ def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id, mimetype
mets_path = Path(mets_path)
if filegrp_mapping:
filegrp_mapping = loads(filegrp_mapping)
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
workspace.merge(
other_workspace,
Expand Down Expand Up @@ -588,7 +570,7 @@ def workspace_backup_add(ctx):
"""
Create a new backup
"""
backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup))
backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
backup_manager.add()

@workspace_backup_cli.command('list')
Expand All @@ -597,7 +579,7 @@ def workspace_backup_list(ctx):
"""
List backups
"""
backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup))
backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
for b in backup_manager.list():
print(b)

Expand All @@ -609,7 +591,7 @@ def workspace_backup_restore(ctx, choose_first, bak):
"""
Restore backup BAK
"""
backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup))
backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
backup_manager.restore(bak, choose_first)

@workspace_backup_cli.command('undo')
Expand All @@ -618,5 +600,5 @@ def workspace_backup_undo(ctx):
"""
Restore the last backup
"""
backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup))
backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
backup_manager.undo()
5 changes: 1 addition & 4 deletions ocrd/ocrd/decorators/__init__.py
Expand Up @@ -59,11 +59,8 @@ def ocrd_cli_wrap_processor(
# raise ValueError('-I/--input-file-grp is required')
# if not kwargs['output_file_grp']:
# raise ValueError('-O/--output-file-grp is required')
if is_local_filename(mets) and not isfile(get_local_filename(mets)):
msg = "File does not exist: %s" % mets
LOG.error(msg)
raise Exception(msg)
resolver = Resolver()
working_dir, mets, _ = resolver.resolve_mets_arguments(working_dir, mets, None)
workspace = resolver.workspace_from_url(mets, working_dir)
page_id = kwargs.get('page_id')
# XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505
Expand Down
53 changes: 53 additions & 0 deletions ocrd/ocrd/resolver.py
@@ -1,5 +1,6 @@
from tempfile import mkdtemp
from pathlib import Path
from warnings import warn

import requests

Expand All @@ -9,6 +10,7 @@
is_local_filename,
get_local_filename,
remove_non_path_from_url,
is_file_in_directory,
nth_url_segment
)
from ocrd.workspace import Workspace
Expand Down Expand Up @@ -199,3 +201,54 @@ def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_me

return Workspace(self, directory, mets, mets_basename=mets_basename)

def resolve_mets_arguments(self, directory, mets_url, mets_basename):
"""
Resolve the ``--mets``, ``--mets-basename`` and `--directory`` argument
into a coherent set of arguments according to https://github.com/OCR-D/core/issues/517
"""
log = getLogger('ocrd.resolver.resolve_mets_arguments')
mets_is_remote = mets_url and (mets_url.startswith('http://') or mets_url.startswith('https://'))

# XXX we might want to be more strict like this but it might break # legacy code
# Allow --mets and --directory together iff --mets is a remote URL
# if directory and mets_url and not mets_is_remote:
# raise ValueError("Use either --mets or --directory, not both")

# If --mets is a URL, a directory must be explicitly provided (not strictly necessary, but retained for legacy behavior)
if not directory and mets_is_remote:
raise ValueError("--mets is an http(s) URL but no --directory was given")

# Determine --mets-basename
if not mets_basename and mets_url:
mets_basename = Path(mets_url).name
elif not mets_basename and not mets_url:
mets_basename = 'mets.xml'
elif mets_basename and mets_url:
raise ValueError("Use either --mets or --mets-basename, not both")
else:
warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning)

# Determine --directory and --mets-url
if not directory and not mets_url:
directory = Path.cwd()
mets_url = Path(directory, mets_basename)
elif directory and not mets_url:
directory = Path(directory).resolve()
mets_url = directory / mets_basename
elif not directory and mets_url:
mets_url = Path(mets_url).resolve()
directory = mets_url.parent
else: # == directory and mets_url:
directory = Path(directory).resolve()
if not mets_is_remote:
# --mets is just a basename and --directory is set, so treat --mets as --mets-basename
if Path(mets_url).parent == Path('.'):
mets_url = directory / mets_url
else:
mets_url = Path(mets_url).resolve()
if not is_file_in_directory(directory, mets_url):
raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))

return str(Path(directory).resolve()), str(mets_url), str(mets_basename)


7 changes: 5 additions & 2 deletions ocrd_utils/ocrd_utils/__init__.py
Expand Up @@ -61,7 +61,8 @@
(produced by `tesserocr`)
* `y0x0y1x1` is the same as `x0y0x1y1` with positions of `x` and `y` in the list swapped

* :py:func:`is_local_filename`,
* :py:func:`is_file_in_directory`
:py:func:`is_local_filename`,
:py:func:`safe_filename`,
:py:func:`abspath`,
:py:func:`get_local_filename`
Expand Down Expand Up @@ -166,10 +167,12 @@
from .os import (
abspath,
list_all_resources,
is_file_in_directory,
list_resource_candidates,
atomic_write,
pushd_popd,
unzip_file_to_dir)
unzip_file_to_dir,
)

from .str import (
assert_file_grp_cardinality,
Expand Down
11 changes: 10 additions & 1 deletion ocrd_utils/ocrd_utils/os.py
Expand Up @@ -3,9 +3,9 @@
"""
__all__ = [
'abspath',
'is_file_in_directory',
'pushd_popd',
'unzip_file_to_dir',
'list_resource_candidates',
'atomic_write',
]

Expand Down Expand Up @@ -120,3 +120,12 @@ def get_fileobject(self, **kwargs):
def atomic_write(fpath):
with atomic_write_(fpath, writer_cls=AtomicWriterPerms, overwrite=True) as f:
yield f


def is_file_in_directory(directory, file):
"""
Return True if ``file`` is in ``directory`` (by checking that all components of ``directory`` are in ``file.parts``)
"""
directory = Path(directory)
file = Path(file)
return list(file.parts)[:len(directory.parts)] == list(directory.parts)
6 changes: 3 additions & 3 deletions tests/cli/test_workspace.py
Expand Up @@ -382,7 +382,7 @@ def test_mets_basename_and_not_mets(self):
with pushd_popd(tempdir=True) as tempdir:
_, out, err = self.invoke_cli(workspace_cli, ['-d', 'foo', '-M', 'not-foo.xml', 'init'])
self.assertEqual(out, join(tempdir, 'foo') + '\n')
self.assertIn('--mets-basename is deprecated. Use --mets/--directory instead', err)
self.assertIn('--mets-basename is deprecated', err)

def test_mets_get_id_set_id(self):
with pushd_popd(tempdir=True):
Expand All @@ -396,10 +396,10 @@ def test_mets_get_id_set_id(self):

def test_mets_directory_incompatible(self):
with pushd_popd(tempdir=True) as tempdir:
with self.assertRaisesRegex(ValueError, "--mets has a directory part inconsistent with --directory"):
with self.assertRaisesRegex(ValueError, "inconsistent with --directory"):
self.invoke_cli(workspace_cli, ['-d', 'foo', '-m', '/somewhere/else', 'init'])

def test_mets_directory_html(self):
def test_mets_directory_http(self):
with pushd_popd(tempdir=True) as tempdir:
with self.assertRaisesRegex(ValueError, r"--mets is an http\(s\) URL but no --directory was given"):
self.invoke_cli(workspace_cli, ['-m', 'https://foo.bar/bla', 'init'])
Expand Down
3 changes: 2 additions & 1 deletion tests/test_decorators.py
Expand Up @@ -212,5 +212,6 @@ def test_parameter_override_wo_param(self):
print(out)
self.assertEqual(out, '{"baz": "two"}\n')


if __name__ == '__main__':
main(__file__)
main()
21 changes: 21 additions & 0 deletions tests/test_resolver.py
Expand Up @@ -277,5 +277,26 @@ def test_download_to_directory_subdir(fixture_copy_kant):
assert fn == 'baz/mets.xml'


def test_resolve_mets_arguments():
"""
https://github.com/OCR-D/core/issues/693
https://github.com/OCR-D/core/issues/517
"""
resolver = Resolver()
assert resolver.resolve_mets_arguments(None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml')
assert resolver.resolve_mets_arguments('/', None, 'mets.xml') == ('/', '/mets.xml', 'mets.xml')
assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None) == ('/foo', '/foo/foo.xml', 'foo.xml')
assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None) == ('/foo', '/foo/foo.xml', 'foo.xml')
assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None) == ('/foo', '/foo/foo.xml', 'foo.xml')
assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None) == ('/foo', 'http://bar/foo.xml', 'foo.xml')
with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"):
resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml')
with pytest.raises(ValueError, match="inconsistent with --directory"):
resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None)
with pytest.warns(DeprecationWarning):
resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml')
with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"):
resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None)

if __name__ == '__main__':
main(__file__)