diff --git a/.github/workflows/Benchmarking.yml b/.github/workflows/Benchmarking.yml index 3c91e003b..a6310afe5 100644 --- a/.github/workflows/Benchmarking.yml +++ b/.github/workflows/Benchmarking.yml @@ -4,80 +4,122 @@ on: pull_request: jobs: - benchmarks: + benchmark-base: runs-on: ubuntu-latest + outputs: + results: ${{ steps.benchmark.outputs.results }} + sha: ${{ steps.benchmark.outputs.sha }} + steps: + - uses: actions/checkout@v5 + with: + ref: ${{ github.base_ref }} + - uses: julia-actions/setup-julia@v2 + with: + version: '1.11' + - uses: julia-actions/cache@v2 + + - name: Run benchmarks + id: benchmark + working-directory: ./benchmarks + run: | + # github output can't handle more than 1 line, hence the tail + julia --project=. -e 'using Pkg; Pkg.instantiate()' + results=$(julia --project=. benchmarks.jl json | tail -n 1 || true) + echo $results + echo "results=$results" >> "$GITHUB_OUTPUT" + echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" + benchmark-head: + runs-on: ubuntu-latest + outputs: + results: ${{ steps.benchmark.outputs.results }} + sha: ${{ steps.benchmark.outputs.sha }} steps: - - name: Checkout Repository - uses: actions/checkout@v5 + - uses: actions/checkout@v5 with: ref: ${{ github.event.pull_request.head.sha }} - - - name: Set up Julia - uses: julia-actions/setup-julia@v2 + - uses: julia-actions/setup-julia@v2 with: version: '1.11' - - uses: julia-actions/cache@v2 - - name: Install Dependencies - run: julia --project=benchmarks/ -e 'using Pkg; Pkg.instantiate()' + - name: Run benchmarks + id: benchmark + working-directory: ./benchmarks + run: | + # github output can't handle more than 1 line, hence the tail + julia --project=. -e 'using Pkg; Pkg.instantiate()' + results=$(julia --project=. 
benchmarks.jl json | tail -n 1 || true) + echo $results + echo "results=$results" >> "$GITHUB_OUTPUT" + echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" - - name: Run Benchmarks - id: run_benchmarks + combine-results: + runs-on: ubuntu-latest + needs: [benchmark-base, benchmark-head] + steps: + - uses: actions/checkout@v5 + with: + ref: ${{ github.event.pull_request.head.sha }} + - uses: julia-actions/setup-julia@v2 + with: + version: '1.11' + - uses: julia-actions/cache@v2 + + - name: Combine benchmark results + working-directory: ./benchmarks run: | - # Capture version info into a variable, print it, and set it as an env var for later steps version_info=$(julia -e 'using InteractiveUtils; versioninfo()') echo "$version_info" echo "VERSION_INFO<<EOF" >> $GITHUB_ENV echo "$version_info" >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV - # Capture benchmark output into a variable. The sed and tail calls cut out anything but the - # final block of results. - echo "Running Benchmarks..." - benchmark_output=$(\ - julia --project=benchmarks benchmarks/benchmarks.jl \ - | sed -n '/Final results:/,$p' \ - | tail -n +2\ - ) + # save outputs of previous jobs to json file + echo "Base results" + echo "--------------------------------------------------------" + echo '${{needs.benchmark-base.outputs.results}}' + echo '${{needs.benchmark-base.outputs.results}}' > base.json + echo "Head results" + echo "--------------------------------------------------------" + echo '${{needs.benchmark-head.outputs.results}}' + echo '${{needs.benchmark-head.outputs.results}}' > head.json - # Print benchmark results directly to the workflow log - echo "Benchmark Results:" - echo "$benchmark_output" + # combine them and save the output as an env var for later steps + julia --project=. -e 'using Pkg; Pkg.instantiate()' + results=$(julia --project=. 
benchmarks.jl combine head.json base.json) + echo "Combined results" + echo "--------------------------------------------------------" + echo "$results" - # Set the benchmark output as an env var for later steps echo "BENCHMARK_OUTPUT<<EOF" >> $GITHUB_ENV - echo "$benchmark_output" >> $GITHUB_ENV + echo "$results" >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV - # Get the current commit SHA of DynamicPPL - DPPL_COMMIT_SHA=$(git rev-parse HEAD) - echo "DPPL_COMMIT_SHA=$DPPL_COMMIT_SHA" >> $GITHUB_ENV - - COMMIT_URL="https://github.com/${{ github.repository }}/commit/$DPPL_COMMIT_SHA" - echo "DPPL_COMMIT_URL=$COMMIT_URL" >> $GITHUB_ENV - - - name: Find Existing Comment + - name: Find existing benchmark comment uses: peter-evans/find-comment@v4 id: find_comment with: issue-number: ${{ github.event.pull_request.number }} comment-author: github-actions[bot] - - name: Post Benchmark Results as PR Comment + - name: Create or update benchmark comment uses: peter-evans/create-or-update-comment@v5 with: issue-number: ${{ github.event.pull_request.number }} body: | - ## Benchmark Report for Commit ${{ env.DPPL_COMMIT_SHA }} + ## Benchmark Report + + - this PR's head: `${{ needs.benchmark-head.outputs.sha }}` + - base branch: `${{ needs.benchmark-base.outputs.sha }}` + ### Computer Information ``` ${{ env.VERSION_INFO }} ``` ### Benchmark Results - ``` + ${{ env.BENCHMARK_OUTPUT }} - ``` + comment-id: ${{ steps.find_comment.outputs.comment-id }} edit-mode: replace diff --git a/.gitignore b/.gitignore index 198907c73..d5a87f1eb 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ .DS_Store Manifest.toml **.~undo-tree~ + +benchmarks/*.json diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml index 0d4e9a654..c154c5ca5 100644 --- a/benchmarks/Project.toml +++ b/benchmarks/Project.toml @@ -4,11 +4,12 @@ version = "0.1.0" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +Chairmarks = 
"0ca39b1e-fe0b-4e98-acfc-b1656634c4de" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" @@ -21,11 +22,12 @@ DynamicPPL = {path = "../"} [compat] ADTypes = "1.14.0" -BenchmarkTools = "1.6.0" +Chairmarks = "1.3.1" Distributions = "0.25.117" DynamicPPL = "0.38" Enzyme = "0.13" -ForwardDiff = "0.10.38, 1" +ForwardDiff = "1" +JSON = "1.3.0" LogDensityProblems = "2.1.2" Mooncake = "0.4" PrettyTables = "3" diff --git a/benchmarks/README.md b/benchmarks/README.md index 35cb8c0bf..ad70b7c03 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,4 +1,4 @@ -To run the benchmarks, run this from the root directory of the repository: +To run the benchmarks locally, run this from the root directory of the repository: ```sh julia --project=benchmarks benchmarks/benchmarks.jl diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl index 035d8ff49..7bae6a9da 100644 --- a/benchmarks/benchmarks.jl +++ b/benchmarks/benchmarks.jl @@ -1,113 +1,231 @@ using Pkg -using DynamicPPLBenchmarks: Models, make_suite, model_dimension -using BenchmarkTools: @benchmark, median, run -using PrettyTables: pretty_table, fmt__printf +using Chairmarks: @be, median +using DynamicPPLBenchmarks: Models, benchmark, model_dimension +using JSON: JSON +using PrettyTables: pretty_table, fmt__printf, EmptyCells, MultiColumn, TextTableFormat +using Printf: @sprintf using StableRNGs: StableRNG rng = StableRNG(23) -function print_results(results_table) - table_matrix = hcat(Iterators.map(collect, zip(results_table...))...) 
- header = [ - "Model", - "Dim", - "AD Backend", - "VarInfo", - "Linked", - "t(eval)/t(ref)", - "t(grad)/t(eval)", - ] - return pretty_table( - table_matrix; - column_labels=header, - backend=:text, - formatters=[fmt__printf("%.1f", [6, 7])], - fit_table_in_display_horizontally=false, - fit_table_in_display_vertically=false, - ) -end - -# Create DynamicPPL.Model instances to run benchmarks on. -smorgasbord_instance = Models.smorgasbord(randn(rng, 100), randn(rng, 100)) -loop_univariate1k, multivariate1k = begin - data_1k = randn(rng, 1_000) - loop = Models.loop_univariate(length(data_1k)) | (; o=data_1k) - multi = Models.multivariate(length(data_1k)) | (; o=data_1k) - loop, multi -end -loop_univariate10k, multivariate10k = begin - data_10k = randn(rng, 10_000) - loop = Models.loop_univariate(length(data_10k)) | (; o=data_10k) - multi = Models.multivariate(length(data_10k)) | (; o=data_10k) - loop, multi -end -lda_instance = begin - w = [1, 2, 3, 2, 1, 1] - d = [1, 1, 1, 2, 2, 2] - Models.lda(2, d, w) -end +head_filename = "benchmarks_result_head.json" +base_filename = "benchmarks_result_base.json" -# Specify the combinations to test: -# (Model Name, model instance, VarInfo choice, AD backend, linked) -chosen_combinations = [ - ( - "Simple assume observe", - Models.simple_assume_observe(randn(rng)), - :typed, - :forwarddiff, - false, - ), - ("Smorgasbord", smorgasbord_instance, :typed, :forwarddiff, false), - ("Smorgasbord", smorgasbord_instance, :simple_namedtuple, :forwarddiff, true), - ("Smorgasbord", smorgasbord_instance, :untyped, :forwarddiff, true), - ("Smorgasbord", smorgasbord_instance, :simple_dict, :forwarddiff, true), - ("Smorgasbord", smorgasbord_instance, :typed_vector, :forwarddiff, true), - ("Smorgasbord", smorgasbord_instance, :untyped_vector, :forwarddiff, true), - ("Smorgasbord", smorgasbord_instance, :typed, :reversediff, true), - ("Smorgasbord", smorgasbord_instance, :typed, :mooncake, true), - ("Smorgasbord", smorgasbord_instance, :typed, 
:enzyme, true), - ("Loop univariate 1k", loop_univariate1k, :typed, :mooncake, true), - ("Multivariate 1k", multivariate1k, :typed, :mooncake, true), - ("Loop univariate 10k", loop_univariate10k, :typed, :mooncake, true), - ("Multivariate 10k", multivariate10k, :typed, :mooncake, true), - ("Dynamic", Models.dynamic(), :typed, :mooncake, true), - ("Submodel", Models.parent(randn(rng)), :typed, :mooncake, true), - ("LDA", lda_instance, :typed, :reversediff, true), +colnames = [ + "Model", "Dim", "AD Backend", "VarInfo", "Linked", "t(eval)/t(ref)", "t(grad)/t(eval)" ] - -# Time running a model-like function that does not use DynamicPPL, as a reference point. -# Eval timings will be relative to this. -reference_time = begin - obs = randn(rng) - median(@benchmark Models.simple_assume_observe_non_model(obs)).time +function print_results(results_table; to_json=false) + if to_json + # Print to the given file as JSON + results_array = [ + Dict(colnames[i] => results_table[j][i] for i in eachindex(colnames)) for + j in eachindex(results_table) + ] + # do not use pretty=true, as GitHub Actions expects no linebreaks + JSON.json(stdout, results_array) + println() + else + # Pretty-print to terminal + table_matrix = hcat(Iterators.map(collect, zip(results_table...))...) + return pretty_table( + table_matrix; + column_labels=colnames, + backend=:text, + formatters=[fmt__printf("%.1f", [6, 7])], + fit_table_in_display_horizontally=false, + fit_table_in_display_vertically=false, + ) + end end -results_table = Tuple{String,Int,String,String,Bool,Float64,Float64}[] +function run(; to_json=false) + # Create DynamicPPL.Model instances to run benchmarks on. 
+ smorgasbord_instance = Models.smorgasbord(randn(rng, 100), randn(rng, 100)) + loop_univariate1k, multivariate1k = begin + data_1k = randn(rng, 1_000) + loop = Models.loop_univariate(length(data_1k)) | (; o=data_1k) + multi = Models.multivariate(length(data_1k)) | (; o=data_1k) + loop, multi + end + loop_univariate10k, multivariate10k = begin + data_10k = randn(rng, 10_000) + loop = Models.loop_univariate(length(data_10k)) | (; o=data_10k) + multi = Models.multivariate(length(data_10k)) | (; o=data_10k) + loop, multi + end + lda_instance = begin + w = [1, 2, 3, 2, 1, 1] + d = [1, 1, 1, 2, 2, 2] + Models.lda(2, d, w) + end -for (model_name, model, varinfo_choice, adbackend, islinked) in chosen_combinations - @info "Running benchmark for $model_name" - suite = make_suite(model, varinfo_choice, adbackend, islinked) - results = run(suite) - eval_time = median(results["evaluation"]).time - relative_eval_time = eval_time / reference_time - ad_eval_time = median(results["gradient"]).time - relative_ad_eval_time = ad_eval_time / eval_time - push!( - results_table, + # Specify the combinations to test: + # (Model Name, model instance, VarInfo choice, AD backend, linked) + chosen_combinations = [ ( - model_name, - model_dimension(model, islinked), - string(adbackend), - string(varinfo_choice), - islinked, - relative_eval_time, - relative_ad_eval_time, + "Simple assume observe", + Models.simple_assume_observe(randn(rng)), + :typed, + :forwarddiff, + false, ), + ("Smorgasbord", smorgasbord_instance, :typed, :forwarddiff, false), + ("Smorgasbord", smorgasbord_instance, :simple_namedtuple, :forwarddiff, true), + ("Smorgasbord", smorgasbord_instance, :untyped, :forwarddiff, true), + ("Smorgasbord", smorgasbord_instance, :simple_dict, :forwarddiff, true), + ("Smorgasbord", smorgasbord_instance, :typed_vector, :forwarddiff, true), + ("Smorgasbord", smorgasbord_instance, :untyped_vector, :forwarddiff, true), + ("Smorgasbord", smorgasbord_instance, :typed, :reversediff, true), + 
("Smorgasbord", smorgasbord_instance, :typed, :mooncake, true), + ("Smorgasbord", smorgasbord_instance, :typed, :enzyme, true), + ("Loop univariate 1k", loop_univariate1k, :typed, :mooncake, true), + ("Multivariate 1k", multivariate1k, :typed, :mooncake, true), + ("Loop univariate 10k", loop_univariate10k, :typed, :mooncake, true), + ("Multivariate 10k", multivariate10k, :typed, :mooncake, true), + ("Dynamic", Models.dynamic(), :typed, :mooncake, true), + ("Submodel", Models.parent(randn(rng)), :typed, :mooncake, true), + ("LDA", lda_instance, :typed, :reversediff, true), + ] + + # Time running a model-like function that does not use DynamicPPL, as a reference point. + # Eval timings will be relative to this. + reference_time = begin + obs = randn(rng) + median(@be Models.simple_assume_observe_non_model(obs)).time + end + @info "Reference evaluation time: $(reference_time) seconds" + + results_table = Tuple{ + String,Int,String,String,Bool,Union{Float64,Missing},Union{Float64,Missing} + }[] + + for (model_name, model, varinfo_choice, adbackend, islinked) in chosen_combinations + @info "Running benchmark for $model_name" + relative_eval_time, relative_ad_eval_time = try + results = benchmark(model, varinfo_choice, adbackend, islinked) + (results.primal_time / reference_time), + (results.grad_time / results.primal_time) + catch e + missing, missing + end + push!( + results_table, + ( + model_name, + model_dimension(model, islinked), + string(adbackend), + string(varinfo_choice), + islinked, + relative_eval_time, + relative_ad_eval_time, + ), + ) + print_results(results_table; to_json=to_json) + end + print_results(results_table; to_json=to_json) + return nothing +end + +struct TestCase + model_name::String + dim::Integer + ad_backend::String + varinfo::String + linked::Bool + TestCase(d::Dict{String,Any}) = new((d[c] for c in colnames[1:5])...) 
+end +function combine(head_filename::String, base_filename::String) + head_results = try + JSON.parsefile(head_filename, Vector{Dict{String,Any}}) + catch + Dict{String,Any}[] + end + @info "Loaded $(length(head_results)) results from $head_filename" + base_results = try + JSON.parsefile(base_filename, Vector{Dict{String,Any}}) + catch + Dict{String,Any}[] + end + @info "Loaded $(length(base_results)) results from $base_filename" + # Identify unique combinations of (Model, Dim, AD Backend, VarInfo, Linked) + head_testcases = Dict( + TestCase(d) => (d[colnames[6]], d[colnames[7]]) for d in head_results ) - println("Results so far:") - print_results(results_table) + base_testcases = Dict( + TestCase(d) => (d[colnames[6]], d[colnames[7]]) for d in base_results + ) + all_testcases = union(Set(keys(head_testcases)), Set(keys(base_testcases))) + @info "$(length(all_testcases)) unique test cases found" + sorted_testcases = sort( + collect(all_testcases); by=(c -> (c.model_name, c.ad_backend, c.varinfo, c.linked)) + ) + results_table = Tuple{ + String,Int,String,String,Bool,String,String,String,String,String,String + }[] + results_colnames = [ + [ + EmptyCells(5), + MultiColumn(3, "t(eval) / t(ref)"), + MultiColumn(3, "t(grad) / t(eval)"), + ], + [colnames[1:5]..., "base", "this PR", "speedup", "base", "this PR", "speedup"], + ] + sprint_float(x::Float64) = @sprintf("%.2f", x) + sprint_float(m::Missing) = "err" + for c in sorted_testcases + head_eval, head_grad = get(head_testcases, c, (missing, missing)) + base_eval, base_grad = get(base_testcases, c, (missing, missing)) + speedup_eval = base_eval / head_eval + speedup_grad = base_grad / head_grad + push!( + results_table, + ( + c.model_name, + c.dim, + c.ad_backend, + c.varinfo, + c.linked, + sprint_float(base_eval), + sprint_float(head_eval), + sprint_float(speedup_eval), + sprint_float(base_grad), + sprint_float(head_grad), + sprint_float(speedup_grad), + ), + ) + end + # Pretty-print to terminal + if 
isempty(results_table) + println("No benchmark results obtained.") + else + table_matrix = hcat(Iterators.map(collect, zip(results_table...))...) + println("```") + pretty_table( + table_matrix; + column_labels=results_colnames, + backend=:text, + fit_table_in_display_horizontally=false, + fit_table_in_display_vertically=false, + table_format=TextTableFormat(; horizontal_line_at_merged_column_labels=true), + ) + println("```") + end end -println("Final results:") -print_results(results_table) +# The command-line arguments are used on CI purposes. +# Run with `julia --project=. benchmarks.jl json` to run benchmarks and output JSON to +# stdout +# Run with `julia --project=. benchmarks.jl combine head.json base.json` to combine two JSON +# files +if length(ARGS) == 3 && ARGS[1] == "combine" + combine(ARGS[2], ARGS[3]) +elseif ARGS == ["json"] + run(; to_json=true) +elseif ARGS == [] + # When running locally just omit the argument and it will just benchmark and print to + # terminal. + run() +else + error("invalid arguments: $(ARGS)") +end diff --git a/benchmarks/src/DynamicPPLBenchmarks.jl b/benchmarks/src/DynamicPPLBenchmarks.jl index 225e40cd8..0dc7ece6e 100644 --- a/benchmarks/src/DynamicPPLBenchmarks.jl +++ b/benchmarks/src/DynamicPPLBenchmarks.jl @@ -1,20 +1,20 @@ module DynamicPPLBenchmarks using DynamicPPL: VarInfo, SimpleVarInfo, VarName -using BenchmarkTools: BenchmarkGroup, @benchmarkable using DynamicPPL: DynamicPPL +using DynamicPPL.TestUtils.AD: run_ad, NoTest using ADTypes: ADTypes using LogDensityProblems: LogDensityProblems using ForwardDiff: ForwardDiff -using Mooncake: Mooncake using ReverseDiff: ReverseDiff +using Mooncake: Mooncake +using Enzyme: Enzyme using StableRNGs: StableRNG include("./Models.jl") using .Models: Models -using Enzyme: Enzyme -export Models, make_suite, model_dimension +export Models, benchmark, model_dimension """ model_dimension(model, islinked) @@ -52,9 +52,11 @@ function to_backend(x::Union{AbstractString,Symbol}) end """ 
- make_suite(model, varinfo_choice::Symbol, adbackend::Symbol, islinked::Bool) + benchmark(model, varinfo_choice::Symbol, adbackend::Symbol, islinked::Bool) + +Benchmark evaluation and gradient calculation for `model` using the selected varinfo type +and AD backend. -Create a benchmark suite for `model` using the selected varinfo type and AD backend. Available varinfo choices: • `:untyped` → uses `DynamicPPL.untyped_varinfo(model)` • `:typed` → uses `DynamicPPL.typed_varinfo(model)` @@ -65,10 +67,10 @@ The AD backend should be specified as a Symbol (e.g. `:forwarddiff`, `:reversedi `islinked` determines whether to link the VarInfo for evaluation. """ -function make_suite(model, varinfo_choice::Symbol, adbackend::Symbol, islinked::Bool) +function benchmark(model, varinfo_choice::Symbol, adbackend::Symbol, islinked::Bool) rng = StableRNG(23) - suite = BenchmarkGroup() + adbackend = to_backend(adbackend) vi = if varinfo_choice == :untyped DynamicPPL.untyped_varinfo(rng, model) @@ -94,20 +96,9 @@ function make_suite(model, varinfo_choice::Symbol, adbackend::Symbol, islinked:: vi = DynamicPPL.link(vi, model) end - f = DynamicPPL.LogDensityFunction( - model, DynamicPPL.getlogjoint_internal, vi; adtype=adbackend + return run_ad( + model, adbackend; varinfo=vi, benchmark=true, test=NoTest(), verbose=false ) - # The parameters at which we evaluate f. - θ = vi[:] - - # Run once to trigger compilation. - LogDensityProblems.logdensity_and_gradient(f, θ) - suite["gradient"] = @benchmarkable $(LogDensityProblems.logdensity_and_gradient)($f, $θ) - - # Also benchmark just standard model evaluation because why not. - suite["evaluation"] = @benchmarkable $(LogDensityProblems.logdensity)($f, $θ) - - return suite end end # module