Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Verify derivs #3064

Merged
merged 4 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 117 additions & 2 deletions app/services/oregon_digital/verify_derivatives_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,123 @@

module OregonDigital
##
# A service to verify that derivatives for the content exist for the migrated asset
# A service to verify that derivatives for the content exist for the migrated work
class VerifyDerivativesService < VerifyService
# Empty placeholder definition: takes no arguments and returns nil.
# NOTE(review): a second `def verify` appears further down in this class body; in Ruby
# the later definition replaces this one, so this line looks like a leftover — confirm and remove.
def verify; end
attr_reader :verification_errors

# Given derivatives info from the original profile, verify that the derivatives
# were successfully created after migrating the new work
# Runs the derivative checks for every child file set of the work and collects
# the resulting messages.
#
# Any StandardError raised while checking is caught and its message is recorded
# as a verification error instead of propagating to the caller.
#
# @return [Hash{Symbol => Array<String>}] `{ derivatives: [...] }`; the array is
#   empty when no problem was found. The same hash is available afterwards
#   through the `verification_errors` reader.
def verify
  @verification_errors = { derivatives: [] }
  if no_fileset_warning
    # Record the warning on the instance as well, so the `verification_errors`
    # reader agrees with the value returned here.
    @verification_errors[:derivatives] << 'no file set'
    return @verification_errors
  end

  file_sets.each { |file_set| verify_file_set(file_set) }
  @verification_errors
rescue StandardError => e
  @verification_errors[:derivatives] << e.message
  @verification_errors
end

# Child file sets of @work, as returned by Hyrax.custom_queries.find_child_file_sets.
# The result is memoized in @file_sets, so the query runs at most once per instance
# (unless it returns nil or false).
def file_sets
@file_sets ||= Hyrax.custom_queries.find_child_file_sets(resource: @work)
end

# MIME type reported by the file metadata object that `file_metadata` returns
# for the given file set.
# NOTE(review): if `file_metadata` returns nil (presumably when the file set has no
# files — confirm), this raises NoMethodError; `verify` rescues StandardError and
# reports the message.
def mime_type(file_set)
file_metadata(file_set).mime_type
end

# First entry returned by Hyrax.custom_queries.find_files for the given file set.
# Only that first entry is consulted; any further files are ignored here.
def file_metadata(file_set)
Hyrax.custom_queries.find_files(file_set: file_set).first
end

# Tells whether the work should be flagged for having no file sets.
#
# @return [Boolean] true when there are no file sets and the class name of
#   @work is anything other than 'Generic'
def no_fileset_warning
  return false unless file_sets.empty?

  @work.class.to_s != 'Generic'
end

# rubocop:disable Metrics/AbcSize
# Dispatches one file set to the derivative check matching its MIME type, then
# confirms the Solr document lists file sets for the work.
#
# A file set whose MIME type appears in none of the FileSet.*_mime_types lists
# gets no derivative check; only the index check runs for it.
#
# @param file_set [Object] the file set to check; handed unchanged to the check_* helper
# @return [Array<String>, nil] the error list when an index problem was appended, otherwise nil
def verify_file_set(file_set)
  # Each list is splatted so `when` tests the MIME string against every entry.
  # Without the splat the whole list is compared to the String and never matches.
  case mime_type(file_set)
  when *FileSet.pdf_mime_types then check_pdf_derivatives(file_set)
  when *FileSet.office_document_mime_types then check_office_document_derivatives(file_set)
  when *FileSet.audio_mime_types then check_audio_derivatives(file_set)
  when *FileSet.video_mime_types then check_video_derivatives(file_set)
  when *FileSet.image_mime_types then check_image_derivatives(file_set)
  end
  @verification_errors[:derivatives] << 'Index problem (fileset)' unless index_info[:file_set]
end
# rubocop:enable Metrics/AbcSize

# Checks run for a PDF file set: a thumbnail check followed by a page-count check.
# Each check appends to @verification_errors[:derivatives] on failure.
def check_pdf_derivatives(file_set)
check_thumbnail(file_set)
check_page_count(file_set)
end

# Checks run for an office-document file set. Only the thumbnail check is
# active; the extracted-text check is commented out below.
def check_office_document_derivatives(file_set)
check_thumbnail(file_set)
# TODO: restore this check when we know more about extracted_text
# check_extracted_content(file_set)
end

# Checks run for an audio file set: requires a derivative with the 'mp3' extension.
# No thumbnail check is made for audio.
def check_audio_derivatives(file_set)
check_file_type(file_set, 'mp3')
end

# Checks run for a video file set: a thumbnail check, then a check for a
# derivative with the 'mp4' extension.
def check_video_derivatives(file_set)
check_thumbnail(file_set)
check_file_type(file_set, 'mp4')
end

# Checks run for an image file set: a thumbnail check, then a check for a
# derivative with the 'jp2' extension.
def check_image_derivatives(file_set)
check_thumbnail(file_set)
check_file_type(file_set, 'jp2')
end

# Base names (directory part removed) of every derivative path that
# Hyrax::DerivativePath.derivatives_for_reference reports for the file set.
#
# @return [Array<String>]
def all_derivatives(file_set)
  derivative_paths = Hyrax::DerivativePath.derivatives_for_reference(file_set)
  derivative_paths.map { |path| File.basename(path) }
end

# Records a thumbnail problem for the file set, if there is one.
#
# 'Missing thumbnail' is reported when `work_info` finds no thumbnail
# derivative. When the derivative exists, 'Index problem (thumbnail)' is
# reported if `index_info` says the Solr document has no thumbnail path.
#
# @return [Array<String>, nil] the error list when a message was appended, otherwise nil
def check_thumbnail(file_set)
  return @verification_errors[:derivatives] << 'Missing thumbnail' unless work_info(file_set)[:has_thumbnail]
  return if index_info[:thumbnail]

  @verification_errors[:derivatives] << 'Index problem (thumbnail)'
end

# Records 'Missing pages' unless the page count in `work_info` is greater than zero.
#
# @return [Array<String>, nil] the error list when the message was appended, otherwise nil
def check_page_count(file_set)
  return if work_info(file_set)[:page_count].positive?

  @verification_errors[:derivatives] << 'Missing pages'
end

# Records 'Missing extracted text' unless `work_info` reports a truthy
# :has_extracted_text for the file set.
#
# @return [Array<String>, nil] the error list when the message was appended, otherwise nil
def check_extracted_content(file_set)
  return if work_info(file_set)[:has_extracted_text]

  @verification_errors[:derivatives] << 'Missing extracted text'
end

# Records "Missing <extension> derivative" when the file set has no derivative
# whose file extension equals `extension`.
#
# @param extension [String] extension without the leading dot, e.g. 'mp3'
# @return [Array<String>, nil] the error list when the message was appended, otherwise nil
def check_file_type(file_set, extension)
  return if derivatives_for_reference(file_set, extension).present?

  @verification_errors[:derivatives] << "Missing #{extension} derivative"
end

# Derivative base names of the file set whose file extension equals `extension`.
#
# @param extension [String] extension without the leading dot, e.g. 'jp2'
# @return [Array<String>]
def derivatives_for_reference(file_set, extension)
  all_derivatives(file_set).select do |basename|
    File.extname(basename) == ".#{extension}"
  end
end

# Flags describing what the Solr document holds for the work, computed once
# and cached in @index_info.
#
# @return [Hash{Symbol => Boolean}] :thumbnail is true when 'thumbnail_path_ss'
#   is not blank; :file_set is true when 'file_set_ids_ssim' is not blank
def index_info
  return @index_info if @index_info

  @index_info = {
    thumbnail: @solr_doc['thumbnail_path_ss'].present?,
    file_set: @solr_doc['file_set_ids_ssim'].present?
  }
end

# Derivative facts for one file set, computed on first request and cached.
#
# The cache is keyed by the file set object itself (identity comparison), so
# each file set gets its own entry. A single memoized hash would hand the first
# file set's results to every later file set.
#
# @param file_set [Object] the file set whose derivatives are inspected
# @return [Hash{Symbol => Object}] :has_thumbnail is true when any derivative
#   base name contains 'thumbnail'; :page_count is the number of derivatives
#   with the 'jp2' extension
def work_info(file_set)
  @work_info ||= {}.compare_by_identity
  @work_info[file_set] ||= {
    has_thumbnail: all_derivatives(file_set).any? { |basename| basename.include?('thumbnail') },
    # has_extracted_text: file_set.extracted_text.present?,
    page_count: derivatives_for_reference(file_set, 'jp2').count
  }
end
end
end
2 changes: 1 addition & 1 deletion config/environments/development.rb
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
config.batch_size = ENV.fetch('BULKRAX_BATCH_SIZE', 100).to_i

config.verify_services = [

OregonDigital::VerifyDerivativesService
]

end
4 changes: 3 additions & 1 deletion config/environments/production.rb
Original file line number Diff line number Diff line change
Expand Up @@ -123,5 +123,7 @@

config.large_export_size = ENV.fetch('BULKRAX_LARGE_EXPORT', 5000).to_i
config.batch_size = ENV.fetch('BULKRAX_BATCH_SIZE', 100).to_i

config.verify_services = [
OregonDigital::VerifyDerivativesService
]
end
237 changes: 237 additions & 0 deletions spec/services/oregon_digital/verify_derivatives_service_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
# frozen_string_literal: true

RSpec.describe OregonDigital::VerifyDerivativesService do
# Shared fixtures: a service built around a stubbed work and Solr document.
# Each example group supplies its own `all_derivative_paths`.
let(:pid) { 'df70jh899' }
let(:work) { double }
let(:file_set) { instance_double('Hyrax::FileSet') }
let(:file_metadata) { instance_double('Hyrax::FileMetadata') }
let(:content_path) { 'spec/fixtures/test.jpg' }
let(:solr_doc) do
  { 'id' => pid, 'thumbnail_path_ss' => 'path', 'file_set_ids_ssim' => ["f0#{pid}"] }
end
let(:service) { described_class.new({ work: work, solr_doc: solr_doc }) }

# Stub every Hyrax lookup the service performs so no repository or disk access happens.
before do
  allow(work).to receive(:class).and_return(Image)
  allow(file_set).to receive(:internal_resource)
  allow(file_metadata).to receive(:mime_type).and_return('image/jpg')
  allow(Hyrax.custom_queries).to receive(:find_files).and_return([file_metadata])
  allow(Hyrax.custom_queries).to receive(:find_child_file_sets).and_return([file_set])
  allow(Hyrax::DerivativePath).to receive(:derivatives_for_reference).and_return(all_derivative_paths)
end

# rubocop:disable RSpec/NestedGroups
# Covers #verify end to end: the missing-file-set warning, the Solr index check,
# and the rescue path that turns a raised error into a reported message.
describe 'verify' do
  let(:all_derivative_paths) { [] }

  context 'when there is no file set' do
    before do
      allow(service).to receive(:file_sets).and_return([])
    end

    context 'when the asset is an Image' do
      it 'returns a warning' do
        expect(service.verify).to eq({ derivatives: ['no file set'] })
      end
    end

    context 'when the asset is a Generic' do
      before do
        allow(work).to receive(:class).and_return(Generic)
      end

      it 'returns no errors' do
        expect(service.verify).to eq({ derivatives: [] })
      end
    end
  end
  # rubocop:enable RSpec/NestedGroups

  context 'when the file set was not indexed' do
    let(:all_derivative_paths) { ['/data/tmp/shared/derivatives/c2/47/ds/08/x-jp2-0000.jp2', '/data/tmp/shared/derivatives/c2/47/ds/08/x-thumbnail.jpeg'] }
    let(:solr_doc) do
      {
        'id' => pid,
        'thumbnail_path_ss' => 'path',
        'file_set_ids_ssim' => []
      }
    end

    it 'returns the error' do
      expect(service.verify).to eq({ derivatives: ['Index problem (fileset)'] })
    end
  end

  context 'when derivatives check fails due to error' do
    before do
      allow(service).to receive(:verify_file_set).and_raise(StandardError.new('I am an error'))
    end

    # Block matcher: calls the expectation block and compares its return value
    # with the error hash #verify is expected to produce.
    RSpec::Matchers.define :match_block do
      match do |response|
        response.call == { derivatives: ['I am an error'] }
      end
      supports_block_expectations
    end

    # The description previously said 'raises error', the opposite of what is asserted.
    it 'does not raise when a file set check fails' do
      expect { service.verify }.not_to raise_error
    end

    it 'returns the error message' do
      expect { service.verify }.to match_block
    end
  end
end

# Exercises #check_thumbnail directly, with a Solr document whose
# 'thumbnail_path_ss' is blank in both scenarios.
describe 'check_thumbnail' do
  let(:solr_doc) do
    { 'id' => pid, 'thumbnail_path_ss' => '', 'file_set_ids_ssim' => [] }
  end

  # #verify is bypassed, so seed the error collection it would normally create.
  before { service.instance_variable_set(:@verification_errors, { derivatives: [] }) }

  context 'when it is an index problem' do
    let(:all_derivative_paths) { ['/data/tmp/shared/derivatives/c2/47/ds/08/x-jp2-0000.jp2', '/data/tmp/shared/derivatives/c2/47/ds/08/x-thumbnail.jpeg'] }

    it 'returns a solr error' do
      service.check_thumbnail(file_set)
      expect(service.verification_errors[:derivatives]).to include('Index problem (thumbnail)')
    end
  end

  context 'when it is a derivative problem' do
    let(:all_derivative_paths) { ['/data/tmp/shared/derivatives/c2/47/ds/08/x-jp2-0000.jp2'] }

    it 'returns a derivative error' do
      service.check_thumbnail(file_set)
      expect(service.verification_errors[:derivatives]).to include('Missing thumbnail')
    end
  end
end

# Exercises #check_pdf_derivatives directly, with and without derivative files.
describe 'check_pdf_derivatives' do
  # #verify is bypassed, so seed the error collection it would normally create.
  before { service.instance_variable_set(:@verification_errors, { derivatives: [] }) }

  context 'when the derivative is present' do
    let(:all_derivative_paths) { ['/data/tmp/shared/derivatives/c2/47/ds/08/x-jp2-0000.jp2', '/data/tmp/shared/derivatives/c2/47/ds/08/x-thumbnail.jpeg'] }

    it 'returns no errors' do
      service.check_pdf_derivatives(file_set)
      expect(service.verification_errors).to eq({ derivatives: [] })
    end
  end

  context 'when the derivative is not present' do
    let(:all_derivative_paths) { [] }

    it 'returns the errors' do
      service.check_pdf_derivatives(file_set)
      expect(service.verification_errors[:derivatives]).to include('Missing pages')
    end
  end
end

# Exercises #check_image_derivatives directly, with and without derivative files.
describe 'check_image_derivatives' do
  # #verify is bypassed, so seed the error collection it would normally create.
  before { service.instance_variable_set(:@verification_errors, { derivatives: [] }) }

  context 'when the derivative is present' do
    let(:all_derivative_paths) { ['/data/tmp/shared/derivatives/p8/41/8n/20/k-jp2.jp2', '/data/tmp/shared/derivatives/p8/41/8n/20/k-thumbnail.jpeg'] }

    it 'returns no errors' do
      service.check_image_derivatives(file_set)
      expect(service.verification_errors).to eq({ derivatives: [] })
    end
  end

  context 'when the derivative is not present' do
    let(:all_derivative_paths) { [] }

    it 'returns the errors' do
      service.check_image_derivatives(file_set)
      expect(service.verification_errors[:derivatives]).to include('Missing jp2 derivative')
    end
  end
end

# Exercises #check_office_document_derivatives directly. The extracted-text
# example is skipped (`xit`).
describe 'check_office_document_derivatives' do
  # #verify is bypassed, so seed the error collection it would normally create.
  before { service.instance_variable_set(:@verification_errors, { derivatives: [] }) }

  context 'when the derivative is present' do
    let(:all_derivative_paths) { ['/data/tmp/shared/derivatives/nv/93/52/84/1-jp2-0000.jp2', '/data/tmp/shared/derivatives/nv/93/52/84/1-thumbnail.jpeg'] }

    it 'returns no errors' do
      service.check_office_document_derivatives(file_set)
      expect(service.verification_errors).to eq({ derivatives: [] })
    end
  end

  # undo skip if/when extracted_text is restored in the service
  context 'when the derivative is not present' do
    let(:all_derivative_paths) { [] }

    before { allow(file_set).to receive(:extracted_text).and_return nil }

    xit 'returns the errors' do
      service.check_office_document_derivatives(file_set)
      expect(service.verification_errors[:derivatives]).to include('Missing extracted text')
    end
  end
end

# Exercises #check_audio_derivatives directly, with and without derivative files.
describe 'check_audio_derivatives' do
  # #verify is bypassed, so seed the error collection it would normally create.
  before { service.instance_variable_set(:@verification_errors, { derivatives: [] }) }

  context 'when derivative is present' do
    let(:all_derivative_paths) { ['/data/tmp/shared/derivatives/cn/69/m4/12/8-jp2.jp2', '/data/tmp/shared/derivatives/cn/69/m4/12/8-mp3.mp3', '/data/tmp/shared/derivatives/cn/69/m4/12/8-thumbnail.jpeg'] }

    it 'returns no errors' do
      service.check_audio_derivatives(file_set)
      expect(service.verification_errors).to eq({ derivatives: [] })
    end
  end

  context 'when derivative is not present' do
    let(:all_derivative_paths) { [] }

    it 'returns the errors' do
      service.check_audio_derivatives(file_set)
      expect(service.verification_errors[:derivatives]).to include('Missing mp3 derivative')
    end
  end
end

# Exercises #check_video_derivatives directly, with and without derivative files.
describe 'check_video_derivatives' do
  # #verify is bypassed, so seed the error collection it would normally create.
  before { service.instance_variable_set(:@verification_errors, { derivatives: [] }) }

  context 'when derivative is present' do
    let(:all_derivative_paths) { ['/data/tmp/shared/derivatives/nc/58/0m/64/9-jp2.jp2', '/data/tmp/shared/derivatives/nc/58/0m/64/9-mp4.mp4', '/data/tmp/shared/derivatives/nc/58/0m/64/9-thumbnail.jpeg'] }

    it 'returns no errors' do
      service.check_video_derivatives(file_set)
      expect(service.verification_errors).to eq({ derivatives: [] })
    end
  end

  context 'when derivative is not present' do
    let(:all_derivative_paths) { [] }

    it 'returns the errors' do
      service.check_video_derivatives(file_set)
      expect(service.verification_errors[:derivatives]).to include('Missing mp4 derivative')
    end
  end
end
end