WeTransfer · julik · Jul 8, 2019 · Jul 6, 2019 · Jul 6, 2019 · Jul 6, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -1,9 +1,9 @@
 rvm:
 - 2.2.0
-- 2.3.0
-- 2.4.2
-- 2.5.0
-- jruby-9.0
+- 2.4.6
+- 2.5.5
+- 2.6.3
+- jruby
 sudo: false
 cache: bundler
 script:

diff --git a/lib/format_parser.rb b/lib/format_parser.rb
@@ -86,18 +86,22 @@ def self.deregister_parser(callable_or_responding_to_new)
   # @param kwargs the keyword arguments to be delegated to `.parse`
   # @see {.parse}
   def self.parse_http(url, **kwargs)
+    # Do not extract the filename, since the URL
+    # can really be "anything". But if the caller
+    # provides filename_hint it will be carried over
     parse(RemoteIO.new(url), **kwargs)
   end
 
   # Parses the file at the given `path` and returns the results as if it were any IO
   # given to `.parse`. The accepted keyword arguments are the same as the ones for `parse`.
+  # The file path will be used to provide the `filename_hint` to `.parse()`.
   #
   # @param path[String] the path to the file to parse on the local filesystem
   # @param kwargs the keyword arguments to be delegated to `.parse`
   # @see {.parse}
   def self.parse_file_at(path, **kwargs)
     File.open(path, 'rb') do |io|
-      parse(io, **kwargs)
+      parse(io, filename_hint: File.basename(path), **kwargs)
     end
   end
 
@@ -116,9 +120,13 @@ def self.parse_file_at(path, **kwargs)
   #   When using `:first` parsing will stop at the first successful match and other parsers won't run.
   # @param limits_config[ReadLimitsConfig] the configuration object for various read/cache limits. The default
   #   one should be good for most cases.
+  # @param filename_hint[String?] the filename. If provided, the first parser applied will be the
+  #   one that responds `true` to `likely_match?` with that filename as an argument. This way
+  #   we can optimize the order of application of parsers and start with the one that is more likely
+  #   to match.
   # @return [Array<Result>, Result, nil] either an Array of results, a single parsing result or `nil`if
   #   no useful metadata could be recovered from the file
-  def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first, limits_config: default_limits_config)
+  def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first, limits_config: default_limits_config, filename_hint: nil)
     # Limit the number of cached _pages_ we may fetch. This allows us to limit the number
     # of page faults (page cache misses) a parser may incur
     read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: limits_config.max_pagefaults_per_parser)
@@ -140,7 +148,7 @@ def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_form
     # Always instantiate parsers fresh for each input, since they might
     # contain instance variables which otherwise would have to be reset
     # between invocations, and would complicate threading situations
-    parsers = parsers_for(natures, formats)
+    parsers = parsers_for(natures, formats, filename_hint)
 
     # Limit how many operations the parser can perform
     limited_io = ReadLimiter.new(
@@ -225,9 +233,11 @@ def self.execute_parser_and_capture_expected_exceptions(parser, limited_io)
   #
   # @param desired_natures[Array] which natures should be considered (like `[:image, :archive]`)
   # @param desired_formats[Array] which formats should be considered (like `[:tif, :jpg]`)
+  # @param filename_hint[String?] the filename hint for the file. If provided,
+  #     the parser that likely matches this filename will be applied first.
   # @return [Array<#call>] an array of callable parsers
   # @raise ArgumentError when there are no parsers satisfying the constraint
-  def self.parsers_for(desired_natures, desired_formats)
+  def self.parsers_for(desired_natures, desired_formats, filename_hint = nil)
     assemble_parser_set = ->(hash_of_sets, keys_of_interest) {
       hash_of_sets.values_at(*keys_of_interest).compact.inject(&:+) || Set.new
     }
@@ -246,6 +256,12 @@ def self.parsers_for(desired_natures, desired_formats)
       @parser_priorities[parser_factory_a] <=> @parser_priorities[parser_factory_b]
     end
 
+    # If there is one parser that is more likely to match, place it first
+    if first_match = factories_in_order_of_priority.find { |f| f.respond_to?(:likely_match?) && f.likely_match?(filename_hint) }
+      factories_in_order_of_priority.delete(first_match)
+      factories_in_order_of_priority.unshift(first_match)
+    end
+
     factories_in_order_of_priority.map { |callable_or_class| instantiate_parser(callable_or_class) }
   end
 

diff --git a/lib/parsers/aiff_parser.rb b/lib/parsers/aiff_parser.rb
@@ -18,6 +18,10 @@ class FormatParser::AIFFParser
     'ANNO',
   ]
 
+  def self.likely_match?(filename)
+    filename =~ /\.aiff?$/i
+  end
+
   def call(io)
     io = FormatParser::IOConstraint.new(io)
     form_chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')

diff --git a/lib/parsers/bmp_parser.rb b/lib/parsers/bmp_parser.rb
@@ -6,6 +6,10 @@ class FormatParser::BMPParser
   VALID_BMP = 'BM'
   PERMISSIBLE_PIXEL_ARRAY_LOCATIONS = 40..512
 
+  def self.likely_match?(filename)
+    filename =~ /\.bmp$/i
+  end
+
   def call(io)
     io = FormatParser::IOConstraint.new(io)
 

diff --git a/lib/parsers/cr2_parser.rb b/lib/parsers/cr2_parser.rb
@@ -7,6 +7,10 @@ class FormatParser::CR2Parser
   TIFF_HEADER = [0x49, 0x49, 0x2a, 0x00]
   CR2_HEADER  = [0x43, 0x52, 0x02, 0x00]
 
+  def self.likely_match?(filename)
+    filename =~ /\.cr2$/i
+  end
+
   def call(io)
     io = FormatParser::IOConstraint.new(io)
 

diff --git a/lib/parsers/dpx_parser.rb b/lib/parsers/dpx_parser.rb
@@ -19,6 +19,10 @@ def le?
 
   private_constant :ByteOrderHintIO
 
+  def self.likely_match?(filename)
+    filename =~ /\.dpx$/i
+  end
+
   def call(io)
     io = FormatParser::IOConstraint.new(io)
     magic = safe_read(io, 4)

diff --git a/lib/parsers/fdx_parser.rb b/lib/parsers/fdx_parser.rb
@@ -1,6 +1,10 @@
 class FormatParser::FDXParser
   include FormatParser::IOUtils
 
+  def self.likely_match?(filename)
+    filename =~ /\.fdx$/i
+  end
+
   def call(io)
     return unless xml_check(io)
     file_and_document_type = safe_read(io, 100)

diff --git a/lib/parsers/flac_parser.rb b/lib/parsers/flac_parser.rb
@@ -5,8 +5,8 @@ class FormatParser::FLACParser
   MAGIC_BYTE_STRING = 'fLaC'
   BLOCK_HEADER_BYTES = 4
 
-  def bytestring_to_int(s)
-    s.unpack('B*')[0].to_i(2)
+  def self.likely_match?(filename)
+    filename =~ /\.flac$/i
   end
 
   def call(io)
@@ -71,5 +71,9 @@ def call(io)
     )
   end
 
+  def bytestring_to_int(s)
+    s.unpack('B*')[0].to_i(2)
+  end
+
   FormatParser.register_parser self, natures: :audio, formats: :flac
 end
diff --git a/lib/parsers/gif_parser.rb b/lib/parsers/gif_parser.rb
@@ -4,6 +4,10 @@ class FormatParser::GIFParser
   HEADERS = ['GIF87a', 'GIF89a'].map(&:b)
   NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'
 
+  def self.likely_match?(filename)
+    filename =~ /\.gif$/i
+  end
+
   def call(io)
     io = FormatParser::IOConstraint.new(io)
     header = safe_read(io, 6)

diff --git a/lib/parsers/jpeg_parser.rb b/lib/parsers/jpeg_parser.rb
@@ -13,6 +13,10 @@ class InvalidStructure < StandardError
   EXIF_MAGIC_STRING = "Exif\0\0".b
   MUST_FIND_NEXT_MARKER_WITHIN_BYTES = 1024
 
+  def self.likely_match?(filename)
+    filename =~ /\.jpe?g$/i
+  end
+
   def call(io)
     @buf = FormatParser::IOConstraint.new(io)
     @width             = nil

diff --git a/lib/parsers/moov_parser.rb b/lib/parsers/moov_parser.rb
@@ -11,6 +11,10 @@ class FormatParser::MOOVParser
     'm4a ' => :m4a,
   }
 
+  def self.likely_match?(filename)
+    filename =~ /\.(mov|m4a|ma4|mp4|aac)$/i
+  end
+
   def call(io)
     return unless matches_moov_definition?(io)
 

diff --git a/lib/parsers/mp3_parser.rb b/lib/parsers/mp3_parser.rb
@@ -53,6 +53,10 @@ def as_json(*)
     end
   end
 
+  def self.likely_match?(filename)
+    filename =~ /\.mp3$/i
+  end
+
   def call(raw_io)
     io = FormatParser::IOConstraint.new(raw_io)
 

diff --git a/lib/parsers/ogg_parser.rb b/lib/parsers/ogg_parser.rb
@@ -6,6 +6,10 @@ class FormatParser::OggParser
   # Maximum size of an Ogg page
   MAX_POSSIBLE_PAGE_SIZE = 65307
 
+  def self.likely_match?(filename)
+    filename =~ /\.ogg$/i
+  end
+
   def call(io)
     # The format consists of chunks of data each called an "Ogg page". Each page
     # begins with the characters, "OggS", to identify the file as Ogg format.

diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb
@@ -9,6 +9,10 @@ class FormatParser::PDFParser
   #
   PDF_MARKER = /%PDF-1\.[0-8]{1}/
 
+  def self.likely_match?(filename)
+    filename =~ /\.(pdf|ai)$/i
+  end
+
   def call(io)
     io = FormatParser::IOConstraint.new(io)
 

diff --git a/lib/parsers/png_parser.rb b/lib/parsers/png_parser.rb
@@ -15,8 +15,8 @@ class FormatParser::PNGParser
     6 => true,
   }
 
-  def chunk_length_and_type(io)
-    safe_read(io, 8).unpack('Na4')
+  def self.likely_match?(filename)
+    filename =~ /\.png$/i
   end
 
   def call(io)
@@ -70,6 +70,10 @@ def call(io)
     )
   end
 
+  def chunk_length_and_type(io)
+    safe_read(io, 8).unpack('Na4')
+  end
+
   # Give it priority 1 since priority 0 is reserved for JPEG, our most popular
   FormatParser.register_parser self, natures: :image, formats: :png, priority: 1
 end
diff --git a/lib/parsers/psd_parser.rb b/lib/parsers/psd_parser.rb
@@ -3,6 +3,10 @@ class FormatParser::PSDParser
 
   PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
 
+  def self.likely_match?(filename)
+    filename =~ /\.psd$/i # Maybe also PSB at some point
+  end
+
   def call(io)
     io = FormatParser::IOConstraint.new(io)
     magic_bytes = safe_read(io, 4).unpack('C4')

diff --git a/lib/parsers/tiff_parser.rb b/lib/parsers/tiff_parser.rb
@@ -5,6 +5,10 @@ class FormatParser::TIFFParser
   MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
   MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
 
+  def self.likely_match?(filename)
+    filename =~ /\.tiff?$/i
+  end
+
   def call(io)
     io = FormatParser::IOConstraint.new(io)
 

diff --git a/lib/parsers/wav_parser.rb b/lib/parsers/wav_parser.rb
@@ -1,6 +1,10 @@
 class FormatParser::WAVParser
   include FormatParser::IOUtils
 
+  def self.likely_match?(filename)
+    filename =~ /\.wav$/i
+  end
+
   def call(io)
     # Read the RIFF header. Chunk descriptor should be RIFF, the size should
     # contain the size of the entire file in bytes minus 8 bytes for the

diff --git a/lib/parsers/zip_parser.rb b/lib/parsers/zip_parser.rb
@@ -5,6 +5,10 @@ class FormatParser::ZIPParser
   include OfficeFormats
   include FormatParser::IOUtils
 
+  def self.likely_match?(filename)
+    filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
+  end
+
   def call(io)
     io = FormatParser::IOConstraint.new(io)
     safe_read(io, 1) # Ensure the file is not empty

diff --git a/spec/format_parser_spec.rb b/spec/format_parser_spec.rb
@@ -139,7 +139,7 @@
 
     it 'passes keyword arguments to parse()' do
       path = fixtures_dir + '/WAV/c_M1F1-Alaw-AFsp.wav'
-      expect(FormatParser).to receive(:parse).with(an_instance_of(File), foo: :bar)
+      expect(FormatParser).to receive(:parse).with(an_instance_of(File), filename_hint: 'c_M1F1-Alaw-AFsp.wav', foo: :bar)
       FormatParser.parse_file_at(path, foo: :bar)
     end
   end
@@ -165,6 +165,14 @@
       image_parsers = FormatParser.parsers_for([:image], [:tif, :jpg, :aiff, :mp3])
       expect(image_parsers.length).to eq(2)
     end
+
+    it 'returns an array with the ZIPParser first if the filename_hint is for a ZIP file' do
+      prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], nil)
+      expect(prioritized_parsers.first).not_to be_kind_of(FormatParser::ZIPParser)
+
+      prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], 'a-file.zip')
+      expect(prioritized_parsers.first).to be_kind_of(FormatParser::ZIPParser)
+    end
   end
 
   describe '.register_parser and .deregister_parser' do

diff --git a/spec/parsers/zip_parser_spec.rb b/spec/parsers/zip_parser_spec.rb
@@ -1,6 +1,11 @@
 require 'spec_helper'
 
 describe FormatParser::ZIPParser do
+  it 'provides filename hints' do
+    expect(FormatParser::ZIPParser).to be_likely_match('file.zip')
+    expect(FormatParser::ZIPParser).not_to be_likely_match('file.tif')
+  end
+
   it 'parses a ZIP archive with Zip64 extra fields (due to the number of files)' do
     fixture_path = fixtures_dir + '/ZIP/arch_many_entries.zip'
     fi_io = File.open(fixture_path, 'rb')