diff --git a/README.md b/README.md index 6b10669a..9c9e8718 100644 --- a/README.md +++ b/README.md @@ -217,7 +217,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an - NEF examples are downloaded from http://www.rawsamples.ch/ and are Creative Common Licensed. ### OGG -- `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `with_garbage_at_the_end.ogg` have been generated by the project contributors +- `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `invalid_with_garbage_at_the_end.ogg` have been generated by the project contributors ### PDF - PDF 2.0 files downloaded from the [PDF Association public Github repository](https://github.com/pdf-association/pdf20examples). These files are licensed under the Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license. @@ -236,7 +236,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an ### WAV - c_11k16bitpcm.wav and c_8kmp316.wav are from [Wikipedia WAV](https://en.wikipedia.org/wiki/WAV#Comparison_of_coding_schemes), retrieved January 7, 2018 - c_39064__alienbomb__atmo-truck.wav is from [freesound](https://freesound.org/people/alienbomb/sounds/39064/) and is CC0 licensed -- c_M1F1-Alaw-AFsp.wav and d_6_Channel_ID.wav are from a [McGill Engineering site](http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples.html) +- c_M1F1-Alaw-AFsp.wav and invalid_d_6_Channel_ID.wav are from a [McGill Engineering site](http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples.html) ### WEBP - With the exception of extended-animation.webp, which was obtained from Wikimedia Commons and is Creative Commons diff --git a/lib/format_parser.rb b/lib/format_parser.rb index bb1c48ee..21618a60 100644 --- a/lib/format_parser.rb +++ b/lib/format_parser.rb @@ -36,6 +36,9 @@ module FormatParser # The value will ensure the parser having it will be applied to the file last. LEAST_PRIORITY = 99 + @registered_natures = [] + @registered_formats = [] + # Register a parser object to be used to perform file format detection. Each parser FormatParser # provides out of the box registers itself using this method. # @@ -68,9 +71,20 @@ def self.register_parser(callable_parser, formats:, natures:, priority: LEAST_PR end @parser_priorities ||= {} @parser_priorities[callable_parser] = priority + + @registered_natures |= parser_provided_natures + @registered_formats |= parser_provided_formats end end + def self.registered_natures + @registered_natures + end + + def self.registered_formats + @registered_formats + end + # Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in # tests, but can also be used to forcibly disable some formats completely. # diff --git a/lib/format_parser/version.rb b/lib/format_parser/version.rb index d171bae6..5cd3d07a 100644 --- a/lib/format_parser/version.rb +++ b/lib/format_parser/version.rb @@ -1,3 +1,3 @@ module FormatParser - VERSION = '2.7.0' + VERSION = '2.7.1' end diff --git a/lib/parsers/mp3_parser.rb b/lib/parsers/mp3_parser.rb index 7bd5e753..7c33d8d9 100644 --- a/lib/parsers/mp3_parser.rb +++ b/lib/parsers/mp3_parser.rb @@ -76,6 +76,11 @@ def call(raw_io) io.seek(0) return if TIFF_HEADER_BYTES.include?(safe_read(io, 4)) + # Prevention against parsing WAV files. + io.seek(0) + wav_chunk_id, _wav_size, wav_riff_type = safe_read(io, 12).unpack('a4la4') + return if wav_chunk_id == 'RIFF' || wav_riff_type == 'WAVE' + # Read all the ID3 tags (or at least attempt to) io.seek(0) id3v1 = ID3Extraction.attempt_id3_v1_extraction(io) @@ -315,5 +320,5 @@ def with_id3tag_local_configs end end - FormatParser.register_parser new, natures: :audio, formats: :mp3, priority: 99 + FormatParser.register_parser new, natures: :audio, formats: :mp3, priority: 101 end diff --git a/spec/fixtures/FLAC/sample_rate_0.flac b/spec/fixtures/FLAC/invalid_sample_rate_0.flac similarity index 100% rename from spec/fixtures/FLAC/sample_rate_0.flac rename to spec/fixtures/FLAC/invalid_sample_rate_0.flac diff --git a/spec/fixtures/JSON/lorem_ipsum.json b/spec/fixtures/JSON/invalid_lorem_ipsum.json similarity index 100% rename from spec/fixtures/JSON/lorem_ipsum.json rename to spec/fixtures/JSON/invalid_lorem_ipsum.json diff --git a/spec/fixtures/JSON/malformed.json b/spec/fixtures/JSON/invalid_malformed.json similarity index 100% rename from spec/fixtures/JSON/malformed.json rename to spec/fixtures/JSON/invalid_malformed.json diff --git a/spec/fixtures/M3U/plain_text.m3u b/spec/fixtures/M3U/invalid_plain_text.m3u similarity index 100% rename from spec/fixtures/M3U/plain_text.m3u rename to spec/fixtures/M3U/invalid_plain_text.m3u diff --git a/spec/fixtures/Ogg/with_garbage_at_the_end.ogg b/spec/fixtures/Ogg/invalid_with_garbage_at_the_end.ogg similarity index 100% rename from spec/fixtures/Ogg/with_garbage_at_the_end.ogg rename to spec/fixtures/Ogg/invalid_with_garbage_at_the_end.ogg diff --git a/spec/fixtures/PDF/PDF 2.0 with offset start.pdf b/spec/fixtures/PDF/invalid PDF 2.0 with offset start.pdf similarity index 100% rename from spec/fixtures/PDF/PDF 2.0 with offset start.pdf rename to spec/fixtures/PDF/invalid PDF 2.0 with offset start.pdf diff --git a/spec/fixtures/PDF/exceed_PDF_read_limit.pdf b/spec/fixtures/PDF/invalid_exceed_PDF_read_limit.pdf similarity index 100% rename from spec/fixtures/PDF/exceed_PDF_read_limit.pdf rename to spec/fixtures/PDF/invalid_exceed_PDF_read_limit.pdf diff --git a/spec/fixtures/PDF/not_a.pdf b/spec/fixtures/PDF/invalid_not_a.pdf similarity index 100% rename from spec/fixtures/PDF/not_a.pdf rename to spec/fixtures/PDF/invalid_not_a.pdf diff --git a/spec/fixtures/WAV/c_SCAM_MIC_SOL001_RUN001.wav b/spec/fixtures/WAV/c_SCAM_MIC_SOL001_RUN001.wav new file mode 100644 index 00000000..94f11925 Binary files /dev/null and b/spec/fixtures/WAV/c_SCAM_MIC_SOL001_RUN001.wav differ diff --git a/spec/fixtures/WAV/d_6_Channel_ID.wav b/spec/fixtures/WAV/invalid_d_6_Channel_ID.wav similarity index 100% rename from spec/fixtures/WAV/d_6_Channel_ID.wav rename to spec/fixtures/WAV/invalid_d_6_Channel_ID.wav diff --git a/spec/fixtures/WEBP/unrecognised-variant.webp b/spec/fixtures/WEBP/invalid-unrecognised-variant.webp similarity index 100% rename from spec/fixtures/WEBP/unrecognised-variant.webp rename to spec/fixtures/WEBP/invalid-unrecognised-variant.webp diff --git a/spec/format_parser_spec.rb b/spec/format_parser_spec.rb index 46c4d875..f8ebedd4 100644 --- a/spec/format_parser_spec.rb +++ b/spec/format_parser_spec.rb @@ -34,6 +34,26 @@ end end + it "fixtures with 'invalid' in the filename should fail to parse" do + Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path| + file_name = File.basename(fixture_path) + next unless file_name.include? "invalid" + File.open(fixture_path, 'rb') do |file| + FormatParser.parse(file) + end + end + end + + it "fixtures without 'invalid' in the filename should be parsed successfully" do + Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path| + file_name = File.basename(fixture_path) + next if file_name.include? "invalid" + File.open(fixture_path, 'rb') do |file| + FormatParser.parse(file) + end + end + end + it 'triggers parsers in a certain order that corresponds to the parser priorities' do file_contents = StringIO.new('a' * 4096) @@ -189,12 +209,20 @@ 'FormatParser::CR3Parser', 'FormatParser::DPXParser', 'FormatParser::FLACParser', - 'FormatParser::MP3Parser', 'FormatParser::OggParser', 'FormatParser::TIFFParser', - 'FormatParser::WAVParser' + 'FormatParser::WAVParser', + 'FormatParser::MP3Parser' ]) end + + it 'ensures that MP3 parser is the last one among all' do + natures = FormatParser.registered_natures + formats = FormatParser.registered_formats + prioritised_parsers = FormatParser.parsers_for(natures, formats) + parser_class_names = prioritised_parsers.map { |parser| parser.class.name } + expect(parser_class_names.last).to eq 'FormatParser::MP3Parser' + end end describe '.register_parser and .deregister_parser' do diff --git a/spec/parsers/flac_parser_spec.rb b/spec/parsers/flac_parser_spec.rb index 608007fa..c06790f1 100644 --- a/spec/parsers/flac_parser_spec.rb +++ b/spec/parsers/flac_parser_spec.rb @@ -55,7 +55,7 @@ end it 'raises an error when sample rate is 0' do - fpath = fixtures_dir + 'FLAC/sample_rate_0.flac' + fpath = fixtures_dir + 'FLAC/invalid_sample_rate_0.flac' expect { subject.call(File.open(fpath, 'rb')) diff --git a/spec/parsers/json_parser_spec.rb b/spec/parsers/json_parser_spec.rb index 7c32edcd..ab7c8a1a 100644 --- a/spec/parsers/json_parser_spec.rb +++ b/spec/parsers/json_parser_spec.rb @@ -99,7 +99,7 @@ def file_size(file_name) describe 'When reading objects invalid JSON files' do it "rejects files with corrupted JSON data" do - io = load_file 'malformed.json' + io = load_file 'invalid_malformed.json' parsed = subject.call(io) @@ -107,7 +107,7 @@ def file_size(file_name) end it "rejects invalid files early without reading the whole content" do - io = load_file 'lorem_ipsum.json' + io = load_file 'invalid_lorem_ipsum.json' parsed = subject.call(io) diff --git a/spec/parsers/m3u_parser_spec.rb b/spec/parsers/m3u_parser_spec.rb index 3614a2dc..40f7b81b 100644 --- a/spec/parsers/m3u_parser_spec.rb +++ b/spec/parsers/m3u_parser_spec.rb @@ -11,7 +11,7 @@ end describe 'an m3u file with missing header' do - let(:m3u_file) { 'plain_text.m3u' } + let(:m3u_file) { 'invalid_plain_text.m3u' } it 'does not parse the file successfully' do expect(parsed_m3u).to be_nil diff --git a/spec/parsers/mp3_parser_spec.rb b/spec/parsers/mp3_parser_spec.rb index 62214918..9343b74a 100644 --- a/spec/parsers/mp3_parser_spec.rb +++ b/spec/parsers/mp3_parser_spec.rb @@ -36,6 +36,12 @@ expect(parsed).to be_nil end + it 'does not misdetect a WAV' do + fpath = fixtures_dir + '/WAV/c_SCAM_MIC_SOL001_RUN001.wav' + parsed = subject.call(File.open(fpath, 'rb')) + expect(parsed).to be_nil + end + describe 'title/artist/album attributes' do let(:parsed) { subject.call(File.open(fpath, 'rb')) } diff --git a/spec/parsers/ogg_parser_spec.rb b/spec/parsers/ogg_parser_spec.rb index 6bbe9fcf..1c9e9c7a 100644 --- a/spec/parsers/ogg_parser_spec.rb +++ b/spec/parsers/ogg_parser_spec.rb @@ -13,7 +13,7 @@ end it 'skips a file if it contains more than MAX_POSSIBLE_OGG_PAGE_SIZE bytes of garbage at the end' do - parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/with_garbage_at_the_end.ogg', 'rb')) + parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/invalid_with_garbage_at_the_end.ogg', 'rb')) expect(parse_result).to be_nil end diff --git a/spec/parsers/pdf_parser_spec.rb b/spec/parsers/pdf_parser_spec.rb index 8d748474..47b6fe37 100644 --- a/spec/parsers/pdf_parser_spec.rb +++ b/spec/parsers/pdf_parser_spec.rb @@ -46,17 +46,17 @@ def parse_pdf(pdf_filename) describe 'broken PDF files should not parse' do it 'PDF with missing version header' do - parsed_pdf = parse_pdf 'not_a.pdf' + parsed_pdf = parse_pdf 'invalid_not_a.pdf' expect(parsed_pdf).to be_nil end it 'PDF 2.0 with offset start' do - parsed_pdf = parse_pdf 'PDF 2.0 with offset start.pdf' + parsed_pdf = parse_pdf 'invalid PDF 2.0 with offset start.pdf' expect(parsed_pdf).to be_nil end it 'exceeds the PDF read limit' do - parsed_pdf = parse_pdf 'exceed_PDF_read_limit.pdf' + parsed_pdf = parse_pdf 'invalid_exceed_PDF_read_limit.pdf' expect(parsed_pdf).to be_nil end end diff --git a/spec/parsers/wav_parser_spec.rb b/spec/parsers/wav_parser_spec.rb index c3dae466..e81c2b0b 100644 --- a/spec/parsers/wav_parser_spec.rb +++ b/spec/parsers/wav_parser_spec.rb @@ -48,7 +48,7 @@ it "cannot parse file with audio format different from 1 and no 'fact' chunk" do expect { - subject.call(File.open(__dir__ + '/../fixtures/WAV/d_6_Channel_ID.wav', 'rb')) + subject.call(File.open(__dir__ + '/../fixtures/WAV/invalid_d_6_Channel_ID.wav', 'rb')) }.to raise_error(FormatParser::IOUtils::InvalidRead) end end diff --git a/spec/parsers/webp_parser_spec.rb b/spec/parsers/webp_parser_spec.rb index 51ce9503..b9fc7980 100644 --- a/spec/parsers/webp_parser_spec.rb +++ b/spec/parsers/webp_parser_spec.rb @@ -7,7 +7,7 @@ end it 'does not parse files with an unrecognised variant' do - result = subject.call(File.open(fixtures_dir + 'WEBP/unrecognised-variant.webp', 'rb')) + result = subject.call(File.open(fixtures_dir + 'WEBP/invalid-unrecognised-variant.webp', 'rb')) expect(result).to be_nil end diff --git a/spec/remote_fetching_spec.rb b/spec/remote_fetching_spec.rb index 0f359970..ae827d61 100644 --- a/spec/remote_fetching_spec.rb +++ b/spec/remote_fetching_spec.rb @@ -104,6 +104,43 @@ expect(file_information.format).to eq(:png) end + describe 'correctly parses WAV files without falling back to another filetype' do + ['c_8kmp316.wav', 'c_SCAM_MIC_SOL001_RUN001.wav'].each do |filename| + it "parses WAV file #{filename}" do + remote_url = 'http://localhost:9399/WAV/' + filename + file_information = FormatParser.parse_http(remote_url) + expect(file_information).not_to be_nil + expect(file_information.format).to eq(:wav) + end + end + end + + describe "correctly parses files over HTTP without filename hint" do + Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path| + file_name = File.basename(fixture_path) + next if file_name.include? "invalid" + + file_type_dir = fixture_path.delete_prefix(fixtures_dir).delete_suffix(file_name) + file_type_dir.delete_prefix!('/').delete_suffix!('/') + next if file_type_dir.empty? + + # skipping this one because it's a special case + next if file_name == "arch_many_entries.zip" + + it "parses #{file_type_dir} file: #{file_name}" do + url = "http://localhost:9399/#{file_type_dir}/#{file_name}?some_param=test".gsub(' ', '%20') + result_with_hint = FormatParser.parse_http(url, filename_hint: file_name) + result_no_hint = FormatParser.parse_http(url) + + expect(result_with_hint).not_to be_nil + expect(result_no_hint).not_to be_nil + + expect(result_no_hint.nature).to eq(result_with_hint.nature) + expect(result_no_hint.format).to eq(result_with_hint.format) + end + end + end + describe 'when parsing remote fixtures' do Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path| filename = File.basename(fixture_path)