Skip to content

Commit

Permalink
Merge pull request #398 from Shopify/symbol-encoding
Browse files Browse the repository at this point in the history
YAML compile cache: encoding aware symbols
  • Loading branch information
casperisfine committed Jan 28, 2022
2 parents 647969f + 76a05db commit 2e61f8d
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 27 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Unreleased

* Improve the YAML compile cache to support `UTF-8` symbols. (#398)
[The default `MessagePack` symbol serializer assumes all symbols are ASCII](https://github.com/msgpack/msgpack-ruby/pull/211),
because of this, non-ASCII compatible symbol would be restored with `ASCII_8BIT` encoding (AKA `BINARY`).
Bootsnap now properly cache them in `UTF-8`.

Note that the above only apply for actual YAML symbols (e..g `--- :foo`).
The issue is still present for string keys parsed with `YAML.load_file(..., symbolize_names: true)`, that is a bug
in `msgpack` that will hopefully be solved soon, see: https://github.com/msgpack/msgpack-ruby/pull/246

* Entirely disable the YAML compile cache if `Encoding.default_internal` is set to an encoding not supported by `msgpack`. (#398)
`Psych` coerce strings to `Encoding.default_internal`, but `MessagePack` doesn't. So in this scenario we can't provide
YAML caching at all without returning the strings in the wrong encoding.
This never came up in practice but might as well be safe.

# 1.10.2

* Reduce the `Kernel.require` extra stack frames some more. Now bootsnap should only add one extra frame per `require` call.
Expand Down
2 changes: 1 addition & 1 deletion ext/bootsnap/bootsnap.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ struct bs_cache_key {
STATIC_ASSERT(sizeof(struct bs_cache_key) == KEY_SIZE);

/* Effectively a schema version. Bumping invalidates all previous caches */
static const uint32_t current_version = 4;
static const uint32_t current_version = 5;

/* hash of e.g. "x86_64-darwin17", invalidating when ruby is recompiled on a
* new OS ABI, etc. */
Expand Down
111 changes: 86 additions & 25 deletions lib/bootsnap/compile_cache/yaml.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,15 @@
module Bootsnap
module CompileCache
module YAML
UnsupportedTags = Class.new(StandardError)
Uncompilable = Class.new(StandardError)
UnsupportedTags = Class.new(Uncompilable)

SUPPORTED_INTERNAL_ENCODINGS = [
nil, # UTF-8
Encoding::UTF_8,
Encoding::ASCII,
Encoding::BINARY,
].freeze

class << self
attr_accessor(:msgpack_factory, :supported_options)
Expand All @@ -16,7 +24,9 @@ def cache_dir=(cache_dir)
end

def precompile(path)
Bootsnap::CompileCache::Native.precompile(
return false unless CompileCache::YAML.supported_internal_encoding?

CompileCache::Native.precompile(
cache_dir,
path.to_s,
@implementation,
Expand All @@ -29,6 +39,44 @@ def install!(cache_dir)
::YAML.singleton_class.prepend(@implementation::Patch)
end

# Psych coerce strings to `Encoding.default_internal` but Message Pack only support
# UTF-8, US-ASCII and BINARY. So if Encoding.default_internal is set to anything else
# we can't safely use the cache
def supported_internal_encoding?
SUPPORTED_INTERNAL_ENCODINGS.include?(Encoding.default_internal)
end

module EncodingAwareSymbols
extend self

if Symbol.method_defined?(:name)
def pack(symbol)
if symbol.encoding == Encoding::UTF_8
1.chr << symbol.name
else
0.chr << symbol.name
end
end
else
def pack(symbol)
if symbol.encoding == Encoding::UTF_8
1.chr << symbol.to_s
else
0.chr << symbol.to_s
end
end
end

def unpack(payload)
payload.freeze
string = payload.byteslice(1..-1)
if payload.ord == 1 # Encoding::UTF_8
string.force_encoding(Encoding::UTF_8)
end
string.to_sym
end
end

def init!
require("yaml")
require("msgpack")
Expand All @@ -43,7 +91,12 @@ def init!
# We want them to roundtrip cleanly, so we use a custom factory.
# see: https://github.com/msgpack/msgpack-ruby/pull/122
factory = MessagePack::Factory.new
factory.register_type(0x00, Symbol)
factory.register_type(
0x00,
Symbol,
packer: EncodingAwareSymbols.method(:pack).to_proc,
unpacker: EncodingAwareSymbols.method(:unpack).to_proc,
)

if defined? MessagePack::Timestamp
factory.register_type(
Expand Down Expand Up @@ -124,7 +177,7 @@ def input_to_storage(contents, _)
packer.pack(false) # not safe loaded
packer.pack(obj)
packer.to_s
rescue NoMethodError, RangeError, UnsupportedTags
rescue NoMethodError, RangeError, Uncompilable
UNCOMPILABLE # The object included things that we can't serialize
end

Expand Down Expand Up @@ -179,44 +232,48 @@ def input_to_output(data, kwargs)

module Patch
def load_file(path, *args)
return super unless CompileCache::YAML.supported_internal_encoding?

return super if args.size > 1

if (kwargs = args.first)
return super unless kwargs.is_a?(Hash)
return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty?
return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty?
end

begin
::Bootsnap::CompileCache::Native.fetch(
Bootsnap::CompileCache::YAML.cache_dir,
CompileCache::Native.fetch(
CompileCache::YAML.cache_dir,
File.realpath(path),
::Bootsnap::CompileCache::YAML::Psych4::SafeLoad,
CompileCache::YAML::Psych4::SafeLoad,
kwargs,
)
rescue Errno::EACCES
::Bootsnap::CompileCache.permission_error(path)
CompileCache.permission_error(path)
end
end

ruby2_keywords :load_file if respond_to?(:ruby2_keywords, true)

def unsafe_load_file(path, *args)
return super unless CompileCache::YAML.supported_internal_encoding?

return super if args.size > 1

if (kwargs = args.first)
return super unless kwargs.is_a?(Hash)
return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty?
return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty?
end

begin
::Bootsnap::CompileCache::Native.fetch(
Bootsnap::CompileCache::YAML.cache_dir,
CompileCache::Native.fetch(
CompileCache::YAML.cache_dir,
File.realpath(path),
::Bootsnap::CompileCache::YAML::Psych4::UnsafeLoad,
CompileCache::YAML::Psych4::UnsafeLoad,
kwargs,
)
rescue Errno::EACCES
::Bootsnap::CompileCache.permission_error(path)
CompileCache.permission_error(path)
end
end

Expand All @@ -233,7 +290,7 @@ def input_to_storage(contents, _)
packer.pack(false) # not safe loaded
packer.pack(obj)
packer.to_s
rescue NoMethodError, RangeError, UnsupportedTags
rescue NoMethodError, RangeError, Uncompilable
UNCOMPILABLE # The object included things that we can't serialize
end

Expand All @@ -253,44 +310,48 @@ def input_to_output(data, kwargs)

module Patch
def load_file(path, *args)
return super unless CompileCache::YAML.supported_internal_encoding?

return super if args.size > 1

if (kwargs = args.first)
return super unless kwargs.is_a?(Hash)
return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty?
return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty?
end

begin
::Bootsnap::CompileCache::Native.fetch(
Bootsnap::CompileCache::YAML.cache_dir,
CompileCache::Native.fetch(
CompileCache::YAML.cache_dir,
File.realpath(path),
::Bootsnap::CompileCache::YAML::Psych3,
CompileCache::YAML::Psych3,
kwargs,
)
rescue Errno::EACCES
::Bootsnap::CompileCache.permission_error(path)
CompileCache.permission_error(path)
end
end

ruby2_keywords :load_file if respond_to?(:ruby2_keywords, true)

def unsafe_load_file(path, *args)
return super unless CompileCache::YAML.supported_internal_encoding?

return super if args.size > 1

if (kwargs = args.first)
return super unless kwargs.is_a?(Hash)
return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty?
return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty?
end

begin
::Bootsnap::CompileCache::Native.fetch(
Bootsnap::CompileCache::YAML.cache_dir,
CompileCache::Native.fetch(
CompileCache::YAML.cache_dir,
File.realpath(path),
::Bootsnap::CompileCache::YAML::Psych3,
CompileCache::YAML::Psych3,
kwargs,
)
rescue Errno::EACCES
::Bootsnap::CompileCache.permission_error(path)
CompileCache.permission_error(path)
end
end

Expand Down
37 changes: 37 additions & 0 deletions test/compile_cache/yaml_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,27 @@ def test_yaml_tags
assert_equal "YAML tags are not supported: !ruby/object", error.message
end

def test_symbols_encoding
symbols = [:ascii, :utf8_fée]
Help.set_file("a.yml", YAML.dump(symbols), 100)

loaded_symbols = FakeYaml.load_file("a.yml")
assert_equal(symbols, loaded_symbols)
assert_equal(symbols.map(&:encoding), loaded_symbols.map(&:encoding))
end

def test_custom_symbols_encoding
sym = "壁に耳あり、障子に目あり".to_sym
Help.set_file("a.yml", YAML.dump(sym), 100)
# YAML is limited to UTF-8 and UTF-16 by spec, but Psych does respect Encoding.default_internal
# so strings and symbol can actually be of any encoding.
assert_raises FakeYaml::Fallback do
with_default_encoding_internal(Encoding::EUC_JP) do
FakeYaml.load_file("a.yml")
end
end
end

if YAML::VERSION >= "4"
def test_load_psych_4_with_alias
Help.set_file("a.yml", "foo: &foo\n bar: 42\nplop:\n <<: *foo", 100)
Expand Down Expand Up @@ -170,4 +191,20 @@ def test_unsafe_load_file
assert_equal({"foo" => {"bar" => 42}, "plop" => {"bar" => 42}}, FakeYaml.unsafe_load_file("a.yml"))
end
end

private

def with_default_encoding_internal(encoding)
original_internal = Encoding.default_internal
$VERBOSE = false
Encoding.default_internal = encoding
$VERBOSE = true
begin
yield
ensure
$VERBOSE = false
Encoding.default_internal = original_internal
$VERBOSE = true
end
end
end
2 changes: 1 addition & 1 deletion test/compile_cache_key_format_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CompileCacheKeyFormatTest < Minitest::Test

def test_key_version
key = cache_key_for_file(FILE)
exp = [4].pack("L")
exp = [5].pack("L")
assert_equal(exp, key[R[:version]])
end

Expand Down

0 comments on commit 2e61f8d

Please sign in to comment.