diff --git a/CHANGELOG.md b/CHANGELOG.md index 788470b..8feb82f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Unreleased +* Revalidate stale cache entries by digesting the source content. + This should significantly improve performance in environments where `mtime` isn't preserved (e.g. CI systems doing a git clone, etc). + See #468. * Open source files and cache entries with `O_NOATIME` when available to reduce disk accesses. See #469. * `bootsnap precompile --gemfile` now look for `.rb` files in the whole gem and not just the `lib/` directory. See #466. diff --git a/README.md b/README.md index 08167de..333607f 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ Bootsnap cache misses can be monitored though a callback: Bootsnap.instrumentation = ->(event, path) { puts "#{event} #{path}" } ``` -`event` is either `:miss` or `:stale`. You can also call `Bootsnap.log!` as a shortcut to +`event` is either `:miss`, `:stale` or `:revalidated`. You can also call `Bootsnap.log!` as a shortcut to log all events to STDERR. To turn instrumentation back off you can set it to nil: diff --git a/ext/bootsnap/bootsnap.c b/ext/bootsnap/bootsnap.c index 439c2ac..efdfdf6 100644 --- a/ext/bootsnap/bootsnap.c +++ b/ext/bootsnap/bootsnap.c @@ -58,8 +58,10 @@ struct bs_cache_key { uint32_t ruby_revision; uint64_t size; uint64_t mtime; - uint64_t data_size; /* not used for equality */ - uint8_t pad[24]; + uint64_t data_size; // + uint64_t digest; + uint8_t digest_set; + uint8_t pad[15]; } __attribute__((packed)); /* @@ -73,7 +75,7 @@ struct bs_cache_key { STATIC_ASSERT(sizeof(struct bs_cache_key) == KEY_SIZE); /* Effectively a schema version. Bumping invalidates all previous caches */ -static const uint32_t current_version = 4; +static const uint32_t current_version = 5; /* hash of e.g. "x86_64-darwin17", invalidating when ruby is recompiled on a * new OS ABI, etc. */ @@ -93,6 +95,7 @@ static VALUE rb_cBootsnap_CompileCache_UNCOMPILABLE; static ID instrumentation_method; static VALUE sym_miss; static VALUE sym_stale; +static VALUE sym_revalidated; static bool instrumentation_enabled = false; static bool readonly = false; @@ -104,9 +107,18 @@ static VALUE bs_rb_fetch(VALUE self, VALUE cachedir_v, VALUE path_v, VALUE handl static VALUE bs_rb_precompile(VALUE self, VALUE cachedir_v, VALUE path_v, VALUE handler); /* Helpers */ +enum cache_status { + miss, + hit, + stale, +}; static void bs_cache_path(const char * cachedir, const VALUE path, char (* cache_path)[MAX_CACHEPATH_SIZE]); static int bs_read_key(int fd, struct bs_cache_key * key); -static int cache_key_equal(struct bs_cache_key * k1, struct bs_cache_key * k2); +static enum cache_status cache_key_equal_fast_path(struct bs_cache_key * k1, struct bs_cache_key * k2); +static int cache_key_equal_slow_path(struct bs_cache_key * current_key, struct bs_cache_key * cached_key, const VALUE input_data); +static int update_cache_key(struct bs_cache_key *current_key, int cache_fd, const char ** errno_provenance); + +static void bs_cache_key_digest(struct bs_cache_key * key, const VALUE input_data); static VALUE bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args); static VALUE bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler); static int open_current_file(char * path, struct bs_cache_key * key, const char ** errno_provenance); @@ -171,6 +183,9 @@ Init_bootsnap(void) sym_stale = ID2SYM(rb_intern("stale")); rb_global_variable(&sym_stale); + sym_revalidated = ID2SYM(rb_intern("revalidated")); + rb_global_variable(&sym_revalidated); + rb_define_module_function(rb_mBootsnap, "instrumentation_enabled=", bs_instrumentation_enabled_set, 1); rb_define_module_function(rb_mBootsnap_CompileCache_Native, "readonly=", bs_readonly_set, 1); rb_define_module_function(rb_mBootsnap_CompileCache_Native, "coverage_running?", bs_rb_coverage_running, 0); @@ -189,6 +204,14 @@ bs_instrumentation_enabled_set(VALUE self, VALUE enabled) return enabled; } +static inline void +bs_instrumentation(VALUE event, VALUE path) +{ + if (RB_UNLIKELY(instrumentation_enabled)) { + rb_funcall(rb_mBootsnap, instrumentation_method, 2, event, path); + } +} + static VALUE bs_readonly_set(VALUE self, VALUE enabled) { @@ -294,17 +317,51 @@ bs_cache_path(const char * cachedir, const VALUE path, char (* cache_path)[MAX_C * The data_size member is not compared, as it serves more of a "header" * function. */ -static int -cache_key_equal(struct bs_cache_key * k1, struct bs_cache_key * k2) +static enum cache_status cache_key_equal_fast_path(struct bs_cache_key *k1, + struct bs_cache_key *k2) { + if (k1->version == k2->version && + k1->ruby_platform == k2->ruby_platform && + k1->compile_option == k2->compile_option && + k1->ruby_revision == k2->ruby_revision && k1->size == k2->size) { + return (k1->mtime == k2->mtime) ? hit : stale; + } + return miss; +} + +static int cache_key_equal_slow_path(struct bs_cache_key *current_key, + struct bs_cache_key *cached_key, + const VALUE input_data) { - return ( - k1->version == k2->version && - k1->ruby_platform == k2->ruby_platform && - k1->compile_option == k2->compile_option && - k1->ruby_revision == k2->ruby_revision && - k1->size == k2->size && - k1->mtime == k2->mtime - ); + bs_cache_key_digest(current_key, input_data); + return current_key->digest == cached_key->digest; +} + +static int update_cache_key(struct bs_cache_key *current_key, int cache_fd, const char ** errno_provenance) +{ + lseek(cache_fd, 0, SEEK_SET); + ssize_t nwrite = write(cache_fd, current_key, KEY_SIZE); + if (nwrite < 0) { + *errno_provenance = "update_cache_key:write"; + return -1; + } + + if (fdatasync(cache_fd) < 0) { + *errno_provenance = "update_cache_key:fdatasync"; + return -1; + } + + return 0; +} + +/* + * Fills the cache key digest. + */ +static void bs_cache_key_digest(struct bs_cache_key *key, + const VALUE input_data) { + if (key->digest_set) + return; + key->digest = fnv1a_64(input_data); + key->digest_set = 1; } /* @@ -393,6 +450,7 @@ open_current_file(char * path, struct bs_cache_key * key, const char ** errno_pr key->ruby_revision = current_ruby_revision; key->size = (uint64_t)statbuf.st_size; key->mtime = (uint64_t)statbuf.st_mtime; + key->digest_set = false; return fd; } @@ -436,7 +494,7 @@ open_cache_file(const char * path, struct bs_cache_key * key, const char ** errn { int fd, res; - fd = open(path, O_RDONLY | O_NOATIME); + fd = open(path, O_RDWR | O_NOATIME); if (fd < 0) { *errno_provenance = "bs_fetch:open_cache_file:open"; return CACHE_MISS; @@ -681,7 +739,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args int res, valid_cache = 0, exception_tag = 0; const char * errno_provenance = NULL; - VALUE input_data; /* data read from source file, e.g. YAML or ruby source */ + VALUE input_data = Qfalse; /* data read from source file, e.g. YAML or ruby source */ VALUE storage_data; /* compiled data, e.g. msgpack / binary iseq */ VALUE output_data; /* return data, e.g. ruby hash or loaded iseq */ @@ -699,20 +757,43 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args cache_fd = open_cache_file(cache_path, &cached_key, &errno_provenance); if (cache_fd == CACHE_MISS || cache_fd == CACHE_STALE) { /* This is ok: valid_cache remains false, we re-populate it. */ - if (RB_UNLIKELY(instrumentation_enabled)) { - rb_funcall(rb_mBootsnap, instrumentation_method, 2, cache_fd == CACHE_MISS ? sym_miss : sym_stale, path_v); - } + bs_instrumentation(cache_fd == CACHE_MISS ? sym_miss : sym_stale, path_v); } else if (cache_fd < 0) { exception_message = rb_str_new_cstr(cache_path); goto fail_errno; } else { /* True if the cache existed and no invalidating changes have occurred since * it was generated. */ - valid_cache = cache_key_equal(¤t_key, &cached_key); - if (RB_UNLIKELY(instrumentation_enabled)) { - if (!valid_cache) { - rb_funcall(rb_mBootsnap, instrumentation_method, 2, sym_stale, path_v); + + switch(cache_key_equal_fast_path(¤t_key, &cached_key)) { + case hit: + valid_cache = true; + break; + case miss: + valid_cache = false; + break; + case stale: + valid_cache = false; + if ((input_data = bs_read_contents(current_fd, current_key.size, + &errno_provenance)) == Qfalse) { + exception_message = path_v; + goto fail_errno; } + valid_cache = cache_key_equal_slow_path(¤t_key, &cached_key, input_data); + if (valid_cache) { + if (!readonly) { + if (update_cache_key(¤t_key, cache_fd, &errno_provenance)) { + exception_message = path_v; + goto fail_errno; + } + } + bs_instrumentation(sym_revalidated, path_v); + } + break; + }; + + if (!valid_cache) { + bs_instrumentation(sym_stale, path_v); } } @@ -726,7 +807,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args else if (res == CACHE_UNCOMPILABLE) { /* If fetch_cached_data returned `Uncompilable` we fallback to `input_to_output` This happens if we have say, an unsafe YAML cache, but try to load it in safe mode */ - if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse){ + if (input_data == Qfalse && (input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) { exception_message = path_v; goto fail_errno; } @@ -745,7 +826,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args /* Cache is stale, invalid, or missing. Regenerate and write it out. */ /* Read the contents of the source file into a buffer */ - if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse){ + if (input_data == Qfalse && (input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) { exception_message = path_v; goto fail_errno; } @@ -767,6 +848,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args * We do however ignore any failures to persist the cache, as it's better * to move along, than to interrupt the process. */ + bs_cache_key_digest(¤t_key, input_data); atomic_write_cache_file(cache_path, ¤t_key, storage_data, &errno_provenance); /* Having written the cache, now convert storage_data to output_data */ @@ -822,12 +904,16 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args static VALUE bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler) { + if (readonly) { + return Qfalse; + } + struct bs_cache_key cached_key, current_key; int cache_fd = -1, current_fd = -1; int res, valid_cache = 0, exception_tag = 0; const char * errno_provenance = NULL; - VALUE input_data; /* data read from source file, e.g. YAML or ruby source */ + VALUE input_data = Qfalse; /* data read from source file, e.g. YAML or ruby source */ VALUE storage_data; /* compiled data, e.g. msgpack / binary iseq */ /* Open the source file and generate a cache key for it */ @@ -843,7 +929,28 @@ bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler) } else { /* True if the cache existed and no invalidating changes have occurred since * it was generated. */ - valid_cache = cache_key_equal(¤t_key, &cached_key); + switch(cache_key_equal_fast_path(¤t_key, &cached_key)) { + case hit: + valid_cache = true; + break; + case miss: + valid_cache = false; + break; + case stale: + valid_cache = false; + if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) { + goto fail; + } + valid_cache = cache_key_equal_slow_path(¤t_key, &cached_key, input_data); + if (valid_cache) { + if (update_cache_key(¤t_key, cache_fd, &errno_provenance)) { + goto fail; + } + + bs_instrumentation(sym_revalidated, path_v); + } + break; + }; } if (valid_cache) { @@ -870,6 +977,7 @@ bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler) if (!RB_TYPE_P(storage_data, T_STRING)) goto fail; /* Write the cache key and storage_data to the cache directory */ + bs_cache_key_digest(¤t_key, input_data); res = atomic_write_cache_file(cache_path, ¤t_key, storage_data, &errno_provenance); if (res < 0) goto fail; diff --git a/test/compile_cache_key_format_test.rb b/test/compile_cache_key_format_test.rb index 3817587..5108374 100644 --- a/test/compile_cache_key_format_test.rb +++ b/test/compile_cache_key_format_test.rb @@ -22,7 +22,7 @@ class CompileCacheKeyFormatTest < Minitest::Test def test_key_version key = cache_key_for_file(FILE) - exp = [4].pack("L") + exp = [5].pack("L") assert_equal(exp, key[R[:version]]) end diff --git a/test/compile_cache_test.rb b/test/compile_cache_test.rb index d17452f..5ed37cf 100644 --- a/test/compile_cache_test.rb +++ b/test/compile_cache_test.rb @@ -9,6 +9,7 @@ class CompileCacheTest < Minitest::Test def teardown super Bootsnap::CompileCache::Native.readonly = false + Bootsnap.instrumentation = nil end def test_compile_option_crc32 @@ -158,6 +159,34 @@ def test_dont_store_cache_after_a_stale_when_readonly load(path) end + def test_dont_revalidate_when_readonly + path = Help.set_file("a.rb", "a = a = 3", 100) + load(path) + + entries = Dir["#{Bootsnap::CompileCache::ISeq.cache_dir}/**/*"].select { |f| File.file?(f) } + assert_equal 1, entries.size + cache_entry = entries.first + old_cache_content = File.binread(cache_entry) + + Bootsnap::CompileCache::Native.readonly = true + + output = RubyVM::InstructionSequence.compile_file(path) + Bootsnap::CompileCache::ISeq.expects(:input_to_storage).never + Bootsnap::CompileCache::ISeq.expects(:storage_to_output).once.returns(output) + Bootsnap::CompileCache::ISeq.expects(:input_to_output).never + + FileUtils.touch(path, mtime: File.mtime(path) + 50) + + calls = [] + Bootsnap.instrumentation = ->(event, source_path) { calls << [event, source_path] } + load(path) + + assert_equal [[:revalidated, "a.rb"]], calls + + new_cache_content = File.binread(cache_entry) + assert_equal old_cache_content, new_cache_content, "Cache entry was mutated" + end + def test_invalid_cache_file path = Help.set_file("a.rb", "a = a = 3", 100) cp = Help.cache_path("#{@tmp_dir}-iseq", path) @@ -177,8 +206,6 @@ def test_instrumentation_hit load(file_path) assert_equal [], calls - ensure - Bootsnap.instrumentation = nil end def test_instrumentation_miss @@ -190,8 +217,19 @@ def test_instrumentation_miss load(file_path) assert_equal [[:miss, "a.rb"]], calls - ensure - Bootsnap.instrumentation = nil + end + + def test_instrumentation_revalidate + file_path = Help.set_file("a.rb", "a = a = 3", 100) + load(file_path) + FileUtils.touch("a.rb", mtime: File.mtime("a.rb") + 42) + + calls = [] + Bootsnap.instrumentation = ->(event, path) { calls << [event, path] } + + load(file_path) + + assert_equal [[:revalidated, "a.rb"]], calls end def test_instrumentation_stale @@ -205,7 +243,5 @@ def test_instrumentation_stale load(file_path) assert_equal [[:stale, "a.rb"]], calls - ensure - Bootsnap.instrumentation = nil end end diff --git a/test/load_path_cache/core_ext/kernel_require_test.rb b/test/load_path_cache/core_ext/kernel_require_test.rb index dc120ea..4aa1b7e 100644 --- a/test/load_path_cache/core_ext/kernel_require_test.rb +++ b/test/load_path_cache/core_ext/kernel_require_test.rb @@ -12,7 +12,7 @@ def test_uses_the_same_duck_type_as_require assert_nil LoadPathCache.load_path_cache cache = Tempfile.new("cache") pid = Process.fork do - LoadPathCache.setup(cache_path: cache, development_mode: true, ignore_directories: nil) + LoadPathCache.setup(cache_path: cache.path, development_mode: true, ignore_directories: nil) dir = File.realpath(Dir.mktmpdir) $LOAD_PATH.push(dir) FileUtils.touch("#{dir}/a.rb") @@ -40,7 +40,7 @@ def test_load_static_libaries assert_nil LoadPathCache.load_path_cache cache = Tempfile.new("cache") pid = Process.fork do - LoadPathCache.setup(cache_path: cache, development_mode: false, ignore_directories: nil) + LoadPathCache.setup(cache_path: cache.path, development_mode: false, ignore_directories: nil) require("prism") end _, status = Process.wait2(pid) @@ -53,7 +53,10 @@ def test_load_static_libaries end class KernelLoadTest < Minitest::Test + include TmpdirHelper + def setup + super @initial_dir = Dir.pwd @dir1 = File.realpath(Dir.mktmpdir) FileUtils.touch("#{@dir1}/a.rb") @@ -70,6 +73,7 @@ def teardown Dir.chdir(@initial_dir) FileUtils.rm_rf(@dir1) FileUtils.rm_rf(@dir2) + super end def test_no_exstensions_for_kernel_load