Allow title, subject, author, and keywords to be unset with an empty string argument (#1117)

Co-authored-by: Frederick D. Hansen <frederick.hansen@gmail.com>
ocrmypdf · Jun 20, 2023 · 050dd1f · 050dd1f
1 parent e44a57a
commit 050dd1f
Show file tree

Hide file tree

Showing 3 changed files with 72 additions and 9 deletions.
diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
@@ -852,24 +852,43 @@ def report_on_metadata(missing):
 
     with pikepdf.open(context.origin) as original, pikepdf.open(working_file) as pdf:
         docinfo = get_docinfo(original, context)
-        with pdf.open_metadata() as meta:
-            meta.load_from_docinfo(docinfo, delete_missing=False, raise_failure=False)
+        with pdf.open_metadata() as meta_pdf:
+            meta_pdf.load_from_docinfo(docinfo, delete_missing=False, raise_failure=False)
             # If xmp:CreateDate is missing, set it to the modify date to
-            # match Ghostscript, for consistency
-            if 'xmp:CreateDate' not in meta:
-                meta['xmp:CreateDate'] = meta.get('xmp:ModifyDate', '')
+            # ensure consistency with Ghostscript.
+            if 'xmp:CreateDate' not in meta_pdf:
+                meta_pdf['xmp:CreateDate'] = meta_pdf.get('xmp:ModifyDate', '')
 
             with original.open_metadata(
                 set_pikepdf_as_editor=False, update_docinfo=False, strict=False
             ) as meta_original:
-                if meta.get('dc:title') == 'Untitled':
+                if meta_pdf.get('dc:title') == 'Untitled':
                     # Ghostscript likes to set title to Untitled if omitted from input.
                     # Reverse this, because PDF/A TechNote 0003:Metadata in PDF/A-1
                     # and the XMP Spec do not make this recommendation.
                     if 'dc:title' not in meta_original:
-                        del meta['dc:title']
-                missing = set(meta_original.keys()) - set(meta.keys())
-                report_on_metadata(missing)
+                        del meta_pdf['dc:title']
+                # If the user explicitly specified an empty string for any of the
+                # following options, the corresponding metadata fields are removed
+                # and must not be reported as missing in the output pdf. Note that
+                # some metadata fields use differing names between PDF/A and PDF.
+                for meta in [meta_pdf, meta_original]:
+                    if options.title == '' and 'dc:title' in meta:
+                        del meta['dc:title']  # PDF-A and PDF
+                    if options.author == '':
+                        if 'dc:creator' in meta:
+                            del meta['dc:creator']  # PDF-A (Not xmp:CreatorTool)
+                        if 'pdf:Author' in meta:
+                            del meta['pdf:Author']  # PDF
+                    if options.subject == '':
+                        if 'dc:description' in meta:
+                            del meta['dc:description']  # PDF-A
+                        if 'dc:subject' in meta:
+                            del meta['dc:subject']  # PDF
+                    if options.keywords == '' and 'pdf:Keywords' in meta:
+                        del meta['pdf:Keywords']  # PDF-A and PDF
+                meta_missing = set(meta_original.keys()) - set(meta_pdf.keys())
+                report_on_metadata(meta_missing)
 
         optimizing = context.plugin_manager.hook.is_optimization_enabled(
             context=context

diff --git a/tests/resources/meta.pdf b/tests/resources/meta.pdf
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
@@ -76,6 +76,50 @@ def test_override_metadata(output_type, resources, outpdf):
         assert pdfa_info['output'] == output_type
 
 
+@pytest.mark.parametrize('output_type', ['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'])
+@pytest.mark.parametrize('field', ['title', 'author', 'subject', 'keywords'])
+def test_unset_metadata(output_type, field, resources, outpdf):
+    input_file = resources / 'meta.pdf'
+
+    # magic strings contained in the input pdf metadata
+    meta = {
+        'title': b'NFY5f7Ft2DWMkxLhXwxvFf7eWR2KeK3vEDcd',
+        'author': b'yXaryipxyRk9dVjWjSSaVaNCKeLRgEVzPRMp',
+        'subject': b't49vimctvnuH7ZeAjAkv52ACvWFjcnm5MPJr',
+        'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh'}
+
+    p = run_ocrmypdf(
+        input_file,
+        outpdf,
+        f'--{field}',
+        '',
+        '--output-type',
+        output_type,
+        '--plugin',
+        'tests/plugins/tesseract_noop.py',
+    )
+
+    assert p.returncode == ExitCode.ok, p.stderr
+
+    # We mainly want to ensure that when '' is passed, the corresponding
+    # metadata is unset in the output pdf. Since metadata is not compressed,
+    # the best way to guarantee the metadata of interest didn't carry
+    # forward is to just check to ensure the corresponding magic string
+    # isn't contained anywhere in the output pdf. We'll also check to ensure
+    # it's in the input pdf and that any values not unset are still in the
+    # output pdf.
+    with open(input_file, 'rb') as before, open(outpdf, 'rb') as after:
+        before_data = before.read()
+        after_data = after.read()
+
+    for k, v in meta.items():
+        assert v in before_data
+        if k == field:
+            assert v not in after_data
+        else:
+            assert v in after_data
+
+
 def test_high_unicode(resources, no_outpdf):
     # Ghostscript doesn't support high Unicode, so neither do we, to be
     # safe