Skip to content

Commit

Permalink
Merge branches 'feature/better-imageops' and 'feature/cleanup-fitz'
Browse files: browse the repository at this point in the history
  • Loading branch information
jbarlow83 committed Apr 16, 2023
2 parents (3731fdf + b2e6a64), commit cb251a8
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 67 deletions.
94 changes: 42 additions & 52 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,9 @@

from .conftest import check_ocrmypdf, run_ocrmypdf

try:
import fitz
except ImportError:
fitz = None


@pytest.mark.parametrize("output_type", ['pdfa', 'pdf'])
def test_preserve_docinfo(output_type, resources, outpdf):
pdf_before = pikepdf.open(resources / 'graph.pdf')

output = check_ocrmypdf(
resources / 'graph.pdf',
outpdf,
Expand All @@ -40,14 +33,13 @@ def test_preserve_docinfo(output_type, resources, outpdf):
'--plugin',
'tests/plugins/tesseract_noop.py',
)

pdf_after = pikepdf.open(output)

for key in ('/Title', '/Author'):
assert pdf_before.docinfo[key] == pdf_after.docinfo[key]

pdfa_info = file_claims_pdfa(str(output))
assert pdfa_info['output'] == output_type
with pikepdf.open(resources / 'graph.pdf') as pdf_before, pikepdf.open(
output
) as pdf_after:
for key in ('/Title', '/Author'):
assert pdf_before.docinfo[key] == pdf_after.docinfo[key]
pdfa_info = file_claims_pdfa(str(output))
assert pdfa_info['output'] == output_type


@pytest.mark.parametrize("output_type", ['pdfa', 'pdf'])
Expand All @@ -71,19 +63,17 @@ def test_override_metadata(output_type, resources, outpdf):

assert p.returncode == ExitCode.ok, p.stderr

before = pikepdf.open(input_file)
after = pikepdf.open(outpdf)

assert after.docinfo.Title == german, after.docinfo
assert after.docinfo.Author == chinese, after.docinfo
assert after.docinfo.get('/Keywords', '') == ''
with pikepdf.open(input_file) as before, pikepdf.open(outpdf) as after:
assert after.docinfo.Title == german, after.docinfo
assert after.docinfo.Author == chinese, after.docinfo
assert after.docinfo.get('/Keywords', '') == ''

before_date = decode_pdf_date(str(before.docinfo.CreationDate))
after_date = decode_pdf_date(str(after.docinfo.CreationDate))
assert before_date == after_date
before_date = decode_pdf_date(str(before.docinfo.CreationDate))
after_date = decode_pdf_date(str(after.docinfo.CreationDate))
assert before_date == after_date

pdfa_info = file_claims_pdfa(outpdf)
assert pdfa_info['output'] == output_type
pdfa_info = file_claims_pdfa(outpdf)
assert pdfa_info['output'] == output_type


def test_high_unicode(resources, no_outpdf):
Expand All @@ -106,10 +96,10 @@ def test_high_unicode(resources, no_outpdf):
assert p.returncode == ExitCode.bad_args, p.stderr


@pytest.mark.skipif(not fitz, reason="test uses fitz")
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_bookmarks_preserved(output_type, ocr_option, resources, outpdf):
fitz = pytest.importorskip('fitz')
input_file = resources / 'toc.pdf'
before_toc = fitz.Document(str(input_file)).get_toc()

Expand Down Expand Up @@ -147,23 +137,24 @@ def test_creation_date_preserved(output_type, resources, infile, outpdf):
'tests/plugins/tesseract_noop.py',
)

pdf_before = pikepdf.open(input_file)
pdf_after = pikepdf.open(outpdf)

before = pdf_before.trailer.get('/Info', {})
after = pdf_after.trailer.get('/Info', {})

if not before:
assert after.get('/CreationDate', '') != ''
else:
# We expect that the creation date stayed the same
date_before = decode_pdf_date(str(before['/CreationDate']))
date_after = decode_pdf_date(str(after['/CreationDate']))
assert seconds_between_dates(date_before, date_after) < 1000

# We expect that the modified date is quite recent
date_after = decode_pdf_date(str(after['/ModDate']))
assert seconds_between_dates(date_after, datetime.datetime.now(timezone.utc)) < 1000
with pikepdf.open(input_file) as pdf_before, pikepdf.open(outpdf) as pdf_after:
before = pdf_before.trailer.get('/Info', {})
after = pdf_after.trailer.get('/Info', {})

if not before:
assert after.get('/CreationDate', '') != ''
else:
# We expect that the creation date stayed the same
date_before = decode_pdf_date(str(before['/CreationDate']))
date_after = decode_pdf_date(str(after['/CreationDate']))
assert seconds_between_dates(date_before, date_after) < 1000

# We expect that the modified date is quite recent
date_after = decode_pdf_date(str(after['/ModDate']))
assert (
seconds_between_dates(date_after, datetime.datetime.now(timezone.utc))
< 1000
)


@pytest.fixture
Expand Down Expand Up @@ -280,10 +271,9 @@ def test_kodak_toc(resources, outpdf):
'tests/plugins/tesseract_noop.py',
)

p = pikepdf.open(outpdf)

if pikepdf.Name.First in p.Root.Outlines:
assert isinstance(p.Root.Outlines.First, pikepdf.Dictionary)
with pikepdf.open(outpdf) as p:
if pikepdf.Name.First in p.Root.Outlines:
assert isinstance(p.Root.Outlines.First, pikepdf.Dictionary)


def test_metadata_fixup_warning(resources, outdir, caplog):
Expand All @@ -301,10 +291,10 @@ def test_metadata_fixup_warning(resources, outdir, caplog):
assert record.levelname != 'WARNING', "Unexpected warning"

# Now add some metadata that will not be copyable
graph = pikepdf.open(outdir / 'graph.pdf')
with graph.open_metadata() as meta:
meta['prism2:publicationName'] = 'OCRmyPDF Test'
graph.save(outdir / 'graph_mod.pdf')
with pikepdf.open(outdir / 'graph.pdf') as graph:
with graph.open_metadata() as meta:
meta['prism2:publicationName'] = 'OCRmyPDF Test'
graph.save(outdir / 'graph_mod.pdf')

context = PdfContext(
options, outdir, outdir / 'graph_mod.pdf', None, get_plugin_manager([])
Expand Down
20 changes: 10 additions & 10 deletions tests/test_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,14 @@ def test_jbig2_lossy(lossy, resources, outpdf):

check_ocrmypdf(*args)

pdf = pikepdf.open(outpdf)
pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
assert pim.filters[0] == '/JBIG2Decode'
with pikepdf.open(outpdf) as pdf:
pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
assert pim.filters[0] == '/JBIG2Decode'

if lossy:
assert '/JBIG2Globals' in pim.decode_parms[0]
else:
assert len(pim.decode_parms) == 0
if lossy:
assert '/JBIG2Globals' in pim.decode_parms[0]
else:
assert len(pim.decode_parms) == 0


@needs_pngquant
Expand All @@ -134,9 +134,9 @@ def test_flate_to_jbig2(resources, outdir):
'tests/plugins/tesseract_noop.py',
)

pdf = pikepdf.open(outdir / 'out.pdf')
pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
assert pim.filters[0] == '/JBIG2Decode'
with pikepdf.open(outdir / 'out.pdf') as pdf:
pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
assert pim.filters[0] == '/JBIG2Decode'


@needs_pngquant
Expand Down
10 changes: 5 additions & 5 deletions tests/test_rotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,11 +237,11 @@ def make_rotate_test(prefix, image_angle, page_angle):
**IMG2PDF_KWARGS,
)
mempdf.seek(0)
pike = pikepdf.open(mempdf)
pike.pages[0].Rotate = page_angle
target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
pike.save(target)
return target
with pikepdf.open(mempdf) as pdf:
pdf.pages[0].Rotate = page_angle
target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
pdf.save(target)
return target

reference = make_rotate_test('ref', 0, 0)
test = make_rotate_test('test', image_angle, page_angle)
Expand Down

0 comments on commit cb251a8

Please sign in to comment.