From 0efe36881f9eeb47da6e496dbaed06fe8f9ef891 Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Fri, 7 Jan 2022 03:37:43 +0000
Subject: [PATCH 01/18] temporarily add some pkgs to do testing

---
 Project.toml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 7166d770..a5bb9d46 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,15 +6,18 @@ repo = "https://github.com/TuringLang/Libtask.jl.git"
 version = "0.6.2"
 
 [deps]
+AdvancedPS = "576499cb-2369-40b2-a588-c64705576edc"
+DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
 IRTools = "7869d1d1-7146-5819-86e3-90919afe41df"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
 
 [compat]
-julia = "1.3"
-MacroTools = "0.5"
 IRTools = "0.4"
+MacroTools = "0.5"
+julia = "1.3"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"

From 463c187a3a6c0a6205e7bb1baa2bbc567259b34f Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Fri, 7 Jan 2022 08:07:22 +0000
Subject: [PATCH 02/18] simple benchmarks

---
 Project.toml |  1 +
 perf/p0.jl   | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 perf/p0.jl

diff --git a/Project.toml b/Project.toml
index a5bb9d46..3538d4eb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ version = "0.6.2"
 
 [deps]
 AdvancedPS = "576499cb-2369-40b2-a588-c64705576edc"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
 IRTools = "7869d1d1-7146-5819-86e3-90919afe41df"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/perf/p0.jl b/perf/p0.jl
new file mode 100644
index 00000000..273fe869
--- /dev/null
+++ b/perf/p0.jl
@@ -0,0 +1,40 @@
+# ]add  Turing#hg/new-libtask2
+
+using Libtask
+using Turing, DynamicPPL, AdvancedPS
+using BenchmarkTools
+
+@model gdemo(x, y) = begin
+    # Assumptions
+    σ ~ InverseGamma(2,3)
+    μ ~ Normal(0,sqrt(σ))
+    # Observations
+    x ~ Normal(μ, sqrt(σ))
+    y ~ Normal(μ, sqrt(σ))
+end
+
+
+# Case 1: Sample from the prior.
+
+m = Turing.Core.TracedModel(gdemo(1.5, 2.), SampleFromPrior(), VarInfo())
+
+f = m.evaluator[1];
+
+args = m.evaluator[2:end];
+
+@btime f(args...)
+# (2.0, VarInfo (2 variables (μ, σ), dimension 2; logp: -6.162))
+
+t = Libtask.CTask(f, args...)
+# schedule(t.task) # work fine!
+# @show Libtask.result(t.tf.tape)
+@btime Libtask.step_in(t.tf.tape, args)
+
+# Case 2: SMC sampler
+
+m = Turing.Core.TracedModel(gdemo(1.5, 2.), Sampler(SMC(50)), VarInfo());
+t = Libtask.CTask(m.evaluator[1], m.evaluator[2:end]...);
+# schedule(t.task)
+# @show Libtask.result(t.tf.tape)
+@btime m.evaluator[1](m.evaluator[2:end]...)
+@btime Libtask.step_in(t.tf.tape, m.evaluator[2:end])

From 7d9eeb0bfc02d814bbdc1df6f393d2e5de87ae85 Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Sun, 9 Jan 2022 07:30:41 +0000
Subject: [PATCH 03/18] use ir and tape cache

---
 perf/p0.jl           | 13 ++++++++++---
 src/tapedfunction.jl |  9 ++++++++-
 src/tapedtask.jl     | 12 +++++++++++-
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/perf/p0.jl b/perf/p0.jl
index 273fe869..b2ae3c57 100644
--- a/perf/p0.jl
+++ b/perf/p0.jl
@@ -22,19 +22,26 @@ f = m.evaluator[1];
 
 args = m.evaluator[2:end];
 
+@show "Directly call..."
 @btime f(args...)
 # (2.0, VarInfo (2 variables (μ, σ), dimension 2; logp: -6.162))
 
-t = Libtask.CTask(f, args...)
+@show "CTask construction..."
+t = @btime  Libtask.CTask(f, args...)
 # schedule(t.task) # work fine!
 # @show Libtask.result(t.tf.tape)
+@show "Step in a tape..."
 @btime Libtask.step_in(t.tf.tape, args)
 
 # Case 2: SMC sampler
 
 m = Turing.Core.TracedModel(gdemo(1.5, 2.), Sampler(SMC(50)), VarInfo());
-t = Libtask.CTask(m.evaluator[1], m.evaluator[2:end]...);
+@show "Directly call..."
+@btime m.evaluator[1](m.evaluator[2:end]...)
+
+@show "CTask construction..."
+t = @btime Libtask.CTask(m.evaluator[1], m.evaluator[2:end]...);
 # schedule(t.task)
 # @show Libtask.result(t.tf.tape)
-@btime m.evaluator[1](m.evaluator[2:end]...)
+@show "Step in a tape..."
 @btime Libtask.step_in(t.tf.tape, m.evaluator[2:end])
diff --git a/src/tapedfunction.jl b/src/tapedfunction.jl
index 083cfccc..186ddb53 100644
--- a/src/tapedfunction.jl
+++ b/src/tapedfunction.jl
@@ -91,7 +91,7 @@ end
 function run_and_record!(tape::Tape, f, args...)
     f = val(f) # f maybe a Boxed closure
     output = try
-        box(f(map(val, args)...))
+        Box{Any}(f(map(val, args)...))
     catch e
         @warn e
         Box{Any}(nothing)
@@ -190,6 +190,13 @@ mutable struct TapedFunction
     end
 end
 
+function reset!(tf::TapedFunction, ir::IRTools.IR, tape::Tape)
+    tf.ir = ir
+    tf.tape = tape
+    setowner!(tape, tf)
+    return tf
+end
+
 function (tf::TapedFunction)(args...)
     if isempty(tf.tape)
         ir = IRTools.@code_ir tf.func(args...)
diff --git a/src/tapedtask.jl b/src/tapedtask.jl
index e6da976d..90aaafba 100644
--- a/src/tapedtask.jl
+++ b/src/tapedtask.jl
@@ -16,9 +16,19 @@ struct TapedTask
     end
 end
 
+const TRCache = Dict{Any, Any}()
+
 function TapedTask(tf::TapedFunction, args...)
     tf.owner != nothing && error("TapedFunction is owned to another task.")
-    isempty(tf.tape) && tf(args...)
+    if isempty(tf.tape)
+        if haskey(TRCache, tf.func)
+            ir, tape = TRCache[tf.func]
+            reset!(tf, ir, copy(tape, Dict{UInt64, Any}()))
+        else
+            tf(args...)
+            TRCache[tf.func] = (tf.ir, tf.tape)
+        end
+    end
     produce_ch = Channel()
     consume_ch = Channel{Int}()
     task = @task try

From b78d3f30fa9e7aedda35921349e240ea43761e5c Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Mon, 10 Jan 2022 19:23:39 +0000
Subject: [PATCH 04/18] use LRUCache instead of Dict

---
 Project.toml     | 1 +
 src/Libtask.jl   | 2 ++
 src/tapedtask.jl | 2 +-
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 3538d4eb..56ec483e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,6 +10,7 @@ AdvancedPS = "576499cb-2369-40b2-a588-c64705576edc"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
 IRTools = "7869d1d1-7146-5819-86e3-90919afe41df"
+LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
diff --git a/src/Libtask.jl b/src/Libtask.jl
index 12a8c516..59795b98 100644
--- a/src/Libtask.jl
+++ b/src/Libtask.jl
@@ -3,6 +3,8 @@ module Libtask
 using IRTools
 using MacroTools
 
+using LRUCache
+
 export CTask, consume, produce
 export TArray, tzeros, tfill, TRef
 
diff --git a/src/tapedtask.jl b/src/tapedtask.jl
index 90aaafba..bc358bc8 100644
--- a/src/tapedtask.jl
+++ b/src/tapedtask.jl
@@ -16,7 +16,7 @@ struct TapedTask
     end
 end
 
-const TRCache = Dict{Any, Any}()
+const TRCache = LRU{Any, Any}(maxsize=10)
 
 function TapedTask(tf::TapedFunction, args...)
     tf.owner != nothing && error("TapedFunction is owned to another task.")

From dd211e9b9c52d8edd911dc30c52bb72b3d5c1982 Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Mon, 10 Jan 2022 19:38:33 +0000
Subject: [PATCH 05/18] partially copy tape

---
 src/tapedtask.jl | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/tapedtask.jl b/src/tapedtask.jl
index bc358bc8..bf2fd7a0 100644
--- a/src/tapedtask.jl
+++ b/src/tapedtask.jl
@@ -23,7 +23,7 @@ function TapedTask(tf::TapedFunction, args...)
     if isempty(tf.tape)
         if haskey(TRCache, tf.func)
             ir, tape = TRCache[tf.func]
-            reset!(tf, ir, copy(tape, Dict{UInt64, Any}()))
+            reset!(tf, ir, copy(tape, Dict{UInt64, Any}(); partial=false))
         else
             tf(args...)
             TRCache[tf.func] = (tf.ir, tf.tape)
@@ -209,14 +209,16 @@ function Base.copy(x::Instruction, on_tape::Tape, roster::Dict{UInt64, Any})
     Instruction(x.fun, input, output, on_tape)
 end
 
-function Base.copy(t::Tape, roster::Dict{UInt64, Any})
+function Base.copy(t::Tape, roster::Dict{UInt64, Any}; partial=true)
     old_data = t.tape
-    new_data = Vector{AbstractInstruction}()
-    new_tape = Tape(new_data, t.counter, t.owner)
+    len = partial ? length(old_data) - t.counter + 1 : length(old_data)
+    start = partial ? t.counter : 1
+    new_data = Vector{AbstractInstruction}(undef, len)
+    new_tape = Tape(new_data, 1, t.owner)
 
-    for x in old_data
+    for (i, x) in enumerate(old_data[start:end])
         new_ins = copy(x, new_tape, roster)
-        push!(new_data, new_ins)
+        new_data[i] = new_ins
     end
 
     return new_tape

From 2e3491d42a9061a0cf846828fa879dd8d2b2b717 Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Tue, 11 Jan 2022 01:21:58 +0000
Subject: [PATCH 06/18] fix a TArray bug

---
 src/tarray.jl | 93 ++++++++++++++++++++++++++-------------------------
 1 file changed, 48 insertions(+), 45 deletions(-)

diff --git a/src/tarray.jl b/src/tarray.jl
index 507f59c5..9c3c2fba 100644
--- a/src/tarray.jl
+++ b/src/tarray.jl
@@ -37,6 +37,9 @@ TArray{T,N}(::UndefInitializer, d::Vararg{<:Integer,N}) where {T,N} = TArray{T,N
 TArray{T,N}(dim::NTuple{N,Int}) where {T,N} = TArray(T, dim)
 TArray(T::Type, dim) = TArray(Array{T}(undef, dim))
 
+localize(x) = x
+localize(x::AbstractArray) = TArray(x)
+getdata(x) = x
 getdata(x::TArray) = x.data
 tape_copy(x::TArray) = TArray(deepcopy(x.data))
 
@@ -166,70 +169,70 @@ end
 # Other methods from stdlib
 
 Base.view(x::TArray, inds...; kwargs...) =
-    Base.view(getdata(x), inds...; kwargs...) |> TArray
-Base.:-(x::TArray) = (-getdata(x)) |> TArray
-Base.transpose(x::TArray) = transpose(getdata(x)) |> TArray
-Base.adjoint(x::TArray) = adjoint(getdata(x)) |> TArray
-Base.repeat(x::TArray; kw...) = repeat(getdata(x); kw...) |> TArray
+    Base.view(getdata(x), inds...; kwargs...) |> localize
+Base.:-(x::TArray) = (-getdata(x)) |> localize
+Base.transpose(x::TArray) = transpose(getdata(x)) |> localize
+Base.adjoint(x::TArray) = adjoint(getdata(x)) |> localize
+Base.repeat(x::TArray; kw...) = repeat(getdata(x); kw...) |> localize
 
 Base.hcat(xs::Union{TArray{T,1}, TArray{T,2}}...) where T =
-    hcat(getdata.(xs)...) |> TArray
+    hcat(getdata.(xs)...) |> localize
 Base.vcat(xs::Union{TArray{T,1}, TArray{T,2}}...) where T =
-    vcat(getdata.(xs)...) |> TArray
+    vcat(getdata.(xs)...) |> localize
 Base.cat(xs::Union{TArray{T,1}, TArray{T,2}}...; dims) where T =
-    cat(getdata.(xs)...; dims = dims) |> TArray
+    cat(getdata.(xs)...; dims = dims) |> localize
 
 
-Base.reshape(x::TArray, dims::Union{Colon,Int}...) = reshape(getdata(x), dims) |> TArray
+Base.reshape(x::TArray, dims::Union{Colon,Int}...) = reshape(getdata(x), dims) |> localize
 Base.reshape(x::TArray, dims::Tuple{Vararg{Union{Int,Colon}}}) =
-    reshape(getdata(x), Base._reshape_uncolon(getdata(x), dims)) |> TArray
-Base.reshape(x::TArray, dims::Tuple{Vararg{Int}}) = reshape(getdata(x), dims) |> TArray
-
-Base.permutedims(x::TArray, perm) = permutedims(getdata(x), perm) |> TArray
-Base.PermutedDimsArray(x::TArray, perm) = PermutedDimsArray(getdata(x), perm) |> TArray
-Base.reverse(x::TArray; dims) = reverse(getdata(x), dims = dims) |> TArray
-
-Base.sum(x::TArray; dims = :) = sum(getdata(x), dims = dims) |> TArray
-Base.sum(f::Union{Function,Type},x::TArray) = sum(f.(getdata(x))) |> TArray
-Base.prod(x::TArray; dims=:) = prod(getdata(x); dims=dims) |> TArray
-Base.prod(f::Union{Function, Type}, x::TArray) = prod(f.(getdata(x))) |> TArray
-
-Base.findfirst(x::TArray, args...) = findfirst(getdata(x), args...) |> TArray
-Base.maximum(x::TArray; dims = :) = maximum(getdata(x), dims = dims) |> TArray
-Base.minimum(x::TArray; dims = :) = minimum(getdata(x), dims = dims) |> TArray
-
-Base.:/(x::TArray, y::TArray) = getdata(x) / getdata(y) |> TArray
-Base.:/(x::AbstractArray, y::TArray) = x / getdata(y) |> TArray
-Base.:/(x::TArray, y::AbstractArray) = getdata(x) / y |> TArray
-Base.:\(x::TArray, y::TArray) = getdata(x) \ getdata(y) |> TArray
-Base.:\(x::AbstractArray, y::TArray) = x \ getdata(y) |> TArray
-Base.:\(x::TArray, y::AbstractArray) = getdata(x) \ y |> TArray
-Base.:*(x::TArray, y::TArray) = getdata(x) * getdata(y) |> TArray
-Base.:*(x::AbstractArray, y::TArray) = x * getdata(y) |> TArray
-Base.:*(x::TArray, y::AbstractArray) = getdata(x) * y |> TArray
+    reshape(getdata(x), Base._reshape_uncolon(getdata(x), dims)) |> localize
+Base.reshape(x::TArray, dims::Tuple{Vararg{Int}}) = reshape(getdata(x), dims) |> localize
+
+Base.permutedims(x::TArray, perm) = permutedims(getdata(x), perm) |> localize
+Base.PermutedDimsArray(x::TArray, perm) = PermutedDimsArray(getdata(x), perm) |> localize
+Base.reverse(x::TArray; dims) = reverse(getdata(x), dims = dims) |> localize
+
+Base.sum(x::TArray; dims = :) = sum(getdata(x), dims = dims) |> localize
+Base.sum(f::Union{Function,Type},x::TArray) = sum(f.(getdata(x))) |> localize
+Base.prod(x::TArray; dims=:) = prod(getdata(x); dims=dims) |> localize
+Base.prod(f::Union{Function, Type}, x::TArray) = prod(f.(getdata(x))) |> localize
+
+Base.findfirst(x::TArray, args...) = findfirst(getdata(x), args...) |> localize
+Base.maximum(x::TArray; dims = :) = maximum(getdata(x), dims = dims) |> localize
+Base.minimum(x::TArray; dims = :) = minimum(getdata(x), dims = dims) |> localize
+
+Base.:/(x::TArray, y::TArray) = getdata(x) / getdata(y) |> localize
+Base.:/(x::AbstractArray, y::TArray) = x / getdata(y) |> localize
+Base.:/(x::TArray, y::AbstractArray) = getdata(x) / y |> localize
+Base.:\(x::TArray, y::TArray) = getdata(x) \ getdata(y) |> localize
+Base.:\(x::AbstractArray, y::TArray) = x \ getdata(y) |> localize
+Base.:\(x::TArray, y::AbstractArray) = getdata(x) \ y |> localize
+Base.:*(x::TArray, y::TArray) = getdata(x) * getdata(y) |> localize
+Base.:*(x::AbstractArray, y::TArray) = x * getdata(y) |> localize
+Base.:*(x::TArray, y::AbstractArray) = getdata(x) * y |> localize
 
 # broadcast
 Base.BroadcastStyle(::Type{<:TArray}) = Broadcast.ArrayStyle{TArray}()
-Broadcast.broadcasted(::Broadcast.ArrayStyle{TArray}, f, args...) = f.(getdata.(args)...) |> TArray
+Broadcast.broadcasted(::Broadcast.ArrayStyle{TArray}, f, args...) = f.(getdata.(args)...) |> localize
 
 import LinearAlgebra
 import LinearAlgebra:  \, /, inv, det, logdet, logabsdet, norm
 
-LinearAlgebra.inv(x::TArray) = inv(getdata(x)) |> TArray
-LinearAlgebra.det(x::TArray) = det(getdata(x)) |> TArray
-LinearAlgebra.logdet(x::TArray) = logdet(getdata(x)) |> TArray
-LinearAlgebra.logabsdet(x::TArray) = logabsdet(getdata(x)) |> TArray
+LinearAlgebra.inv(x::TArray) = inv(getdata(x)) |> localize
+LinearAlgebra.det(x::TArray) = det(getdata(x)) |> localize
+LinearAlgebra.logdet(x::TArray) = logdet(getdata(x)) |> localize
+LinearAlgebra.logabsdet(x::TArray) = logabsdet(getdata(x)) |> localize
 LinearAlgebra.norm(x::TArray, p::Real = 2) =
-    LinearAlgebra.norm(getdata(x), p) |> TArray
+    LinearAlgebra.norm(getdata(x), p) |> localize
 
 import LinearAlgebra: dot
-dot(x::TArray, ys::TArray) = dot(getdata(x), getdata(ys)) |> TArray
-dot(x::AbstractArray, ys::TArray) = dot(x, getdata(ys)) |> TArray
-dot(x::TArray, ys::AbstractArray) = dot(getdata(x), ys) |> TArray
+dot(x::TArray, ys::TArray) = dot(getdata(x), getdata(ys)) |> localize
+dot(x::AbstractArray, ys::TArray) = dot(x, getdata(ys)) |> localize
+dot(x::TArray, ys::AbstractArray) = dot(getdata(x), ys) |> localize
 
 using Statistics
-Statistics.mean(x::TArray; dims = :) = mean(getdata(x), dims = dims) |> TArray
-Statistics.std(x::TArray; kw...) = std(getdata(x), kw...) |> TArray
+Statistics.mean(x::TArray; dims = :) = mean(getdata(x), dims = dims) |> localize
+Statistics.std(x::TArray; kw...) = std(getdata(x), kw...) |> localize
 
 # TODO
 # * NNlib

From 400571cc12893deb2ac82694e2d0835df003a83d Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Wed, 12 Jan 2022 00:24:34 +0000
Subject: [PATCH 07/18] add Project.toml for perf dir

---
 Project.toml            |  8 ++------
 perf/Project.toml       | 22 ++++++++++++++++++++++
 perf/p1.jl              | 39 +++++++++++++++++++++++++++++++++++++++
 perf/src/LibtaskPerf.jl |  2 ++
 src/tapedfunction.jl    |  5 ++++-
 src/tapedtask.jl        | 11 +++++++----
 6 files changed, 76 insertions(+), 11 deletions(-)
 create mode 100644 perf/Project.toml
 create mode 100644 perf/p1.jl
 create mode 100644 perf/src/LibtaskPerf.jl

diff --git a/Project.toml b/Project.toml
index 56ec483e..1de90e38 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,20 +6,16 @@ repo = "https://github.com/TuringLang/Libtask.jl.git"
 version = "0.6.2"
 
 [deps]
-AdvancedPS = "576499cb-2369-40b2-a588-c64705576edc"
-BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
-DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
 IRTools = "7869d1d1-7146-5819-86e3-90919afe41df"
 LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
 
 [compat]
-IRTools = "0.4"
-MacroTools = "0.5"
 julia = "1.3"
+MacroTools = "0.5"
+IRTools = "0.4"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
diff --git a/perf/Project.toml b/perf/Project.toml
new file mode 100644
index 00000000..6cd85f3c
--- /dev/null
+++ b/perf/Project.toml
@@ -0,0 +1,22 @@
+name = "LibtaskPerf"
+uuid = "09aeecf0-733b-11ec-8a4e-06c55de9177a"
+license = "MIT"
+desc = "Performance Tuning for Libtask"
+version = "0.0.1"
+
+[deps]
+AbstractMCMC = "80f14c24-f653-4e6a-9b94-39d6b0f70001"
+AdvancedPS = "576499cb-2369-40b2-a588-c64705576edc"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
+Libtask = "6f1fad26-d15e-5dc8-ae53-837a1d7b8c9f"
+Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[compat]
+julia = "1.3"
+
+[extras]
+
+[targets]
+test = ["Test", "BenchmarkTools"]
diff --git a/perf/p1.jl b/perf/p1.jl
new file mode 100644
index 00000000..85adcfa9
--- /dev/null
+++ b/perf/p1.jl
@@ -0,0 +1,39 @@
+using Turing, Test, AbstractMCMC, DynamicPPL, Random
+
+import AbstractMCMC.AbstractSampler
+
+function check_numerical(chain,
+                         symbols::Vector,
+                         exact_vals::Vector;
+                         atol=0.2,
+                         rtol=0.0)
+    for (sym, val) in zip(symbols, exact_vals)
+        E = val isa Real ?
+            mean(chain[sym]) :
+            vec(mean(chain[sym], dims=1))
+        @info (symbol=sym, exact=val, evaluated=E)
+        @test E ≈ val atol=atol rtol=rtol
+    end
+end
+
+function check_MoGtest_default(chain; atol=0.2, rtol=0.0)
+    check_numerical(chain,
+                    [:z1, :z2, :z3, :z4, :mu1, :mu2],
+                    [1.0, 1.0, 2.0, 2.0, 1.0, 4.0],
+                    atol=atol, rtol=rtol)
+end
+
+@model gdemo_d(x, y) = begin
+    s ~ InverseGamma(2, 3)
+    m ~ Normal(0, sqrt(s))
+    x ~ Normal(m, sqrt(s))
+    y ~ Normal(m, sqrt(s))
+    return s, m
+end
+
+alg = CSMC(15)
+chain = sample(gdemo_d(1.5, 2.0), alg, 5_00)
+
+@show chain
+
+check_numerical(chain, [:s, :m], [49/24, 7/6], atol=0.1)
diff --git a/perf/src/LibtaskPerf.jl b/perf/src/LibtaskPerf.jl
new file mode 100644
index 00000000..b0871b2d
--- /dev/null
+++ b/perf/src/LibtaskPerf.jl
@@ -0,0 +1,2 @@
+module LibtaskPerf
+end
diff --git a/src/tapedfunction.jl b/src/tapedfunction.jl
index 186ddb53..8e58df3a 100644
--- a/src/tapedfunction.jl
+++ b/src/tapedfunction.jl
@@ -53,6 +53,9 @@ function Base.show(io::IO, instruction::Instruction)
 end
 
 function Base.show(io::IO, tp::Tape)
+    # we use an extra IOBuffer to collect all the data and then
+    # output it once to avoid output interrupt during task context
+    # switching
     buf = IOBuffer()
     print(buf, "$(length(tp))-element Tape")
     isempty(tp) || println(buf, ":")
@@ -91,7 +94,7 @@ end
 function run_and_record!(tape::Tape, f, args...)
     f = val(f) # f maybe a Boxed closure
     output = try
-        Box{Any}(f(map(val, args)...))
+        box(f(map(val, args)...))
     catch e
         @warn e
         Box{Any}(nothing)
diff --git a/src/tapedtask.jl b/src/tapedtask.jl
index bf2fd7a0..b96cb5e6 100644
--- a/src/tapedtask.jl
+++ b/src/tapedtask.jl
@@ -19,14 +19,17 @@ end
 const TRCache = LRU{Any, Any}(maxsize=10)
 
 function TapedTask(tf::TapedFunction, args...)
-    tf.owner != nothing && error("TapedFunction is owned to another task.")
+    tf.owner != nothing && error("TapedFunction is owned by another task.")
     if isempty(tf.tape)
-        if haskey(TRCache, tf.func)
-            ir, tape = TRCache[tf.func]
+        cache_key = (tf.func, typeof.(args)...)
+        if haskey(TRCache, cache_key)
+            ir, tape = TRCache[cache_key]
+            # Here we don't need change the initial arguments of the tape,
+            # it will be set when we `step_in` to the tape.
             reset!(tf, ir, copy(tape, Dict{UInt64, Any}(); partial=false))
         else
             tf(args...)
-            TRCache[tf.func] = (tf.ir, tf.tape)
+            TRCache[cache_key] = (tf.ir, tf.tape)
         end
     end
     produce_ch = Channel()

From 3a2524061b9c7be7cefce423c46d4c546f2d9e05 Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Wed, 12 Jan 2022 00:41:04 +0000
Subject: [PATCH 08/18] minor update

---
 perf/p1.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf/p1.jl b/perf/p1.jl
index 85adcfa9..4ecd2ec8 100644
--- a/perf/p1.jl
+++ b/perf/p1.jl
@@ -32,7 +32,7 @@ end
 end
 
 alg = CSMC(15)
-chain = sample(gdemo_d(1.5, 2.0), alg, 5_00)
+chain = sample(gdemo_d(1.5, 2.0), alg, 5_000)
 
 @show chain
 

From f24480ebd45e9a5ded071aade25a64d075876847 Mon Sep 17 00:00:00 2001
From: Hong Ge <hg344@cam.ac.uk>
Date: Wed, 12 Jan 2022 12:32:43 +0000
Subject: [PATCH 09/18] Update src/tapedtask.jl

Co-authored-by: David Widmann <devmotion@users.noreply.github.com>
---
 src/tapedtask.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tapedtask.jl b/src/tapedtask.jl
index b96cb5e6..72af7d2e 100644
--- a/src/tapedtask.jl
+++ b/src/tapedtask.jl
@@ -19,7 +19,7 @@ end
 const TRCache = LRU{Any, Any}(maxsize=10)
 
 function TapedTask(tf::TapedFunction, args...)
-    tf.owner != nothing && error("TapedFunction is owned by another task.")
+    tf.owner !== nothing && error("TapedFunction is owned by another task.")
     if isempty(tf.tape)
         cache_key = (tf.func, typeof.(args)...)
         if haskey(TRCache, cache_key)

From 9b548c6acd37e5413328d899c4e7c88783edbfbc Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Wed, 12 Jan 2022 23:57:38 +0000
Subject: [PATCH 10/18] remove redundant module

---
 perf/Project.toml       | 6 ------
 perf/src/LibtaskPerf.jl | 2 --
 2 files changed, 8 deletions(-)
 delete mode 100644 perf/src/LibtaskPerf.jl

diff --git a/perf/Project.toml b/perf/Project.toml
index 6cd85f3c..829ff90d 100644
--- a/perf/Project.toml
+++ b/perf/Project.toml
@@ -1,9 +1,3 @@
-name = "LibtaskPerf"
-uuid = "09aeecf0-733b-11ec-8a4e-06c55de9177a"
-license = "MIT"
-desc = "Performance Tuning for Libtask"
-version = "0.0.1"
-
 [deps]
 AbstractMCMC = "80f14c24-f653-4e6a-9b94-39d6b0f70001"
 AdvancedPS = "576499cb-2369-40b2-a588-c64705576edc"
diff --git a/perf/src/LibtaskPerf.jl b/perf/src/LibtaskPerf.jl
deleted file mode 100644
index b0871b2d..00000000
--- a/perf/src/LibtaskPerf.jl
+++ /dev/null
@@ -1,2 +0,0 @@
-module LibtaskPerf
-end

From c6ec201e52bb4d9b9c28cc2affd7138b40d57f09 Mon Sep 17 00:00:00 2001
From: Hong Ge <hg344@cam.ac.uk>
Date: Mon, 17 Jan 2022 17:20:07 +0000
Subject: [PATCH 11/18] Catch and print error while re-running a (cached) tape.

---
 src/tapedfunction.jl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/tapedfunction.jl b/src/tapedfunction.jl
index 8e58df3a..d213b0af 100644
--- a/src/tapedfunction.jl
+++ b/src/tapedfunction.jl
@@ -69,8 +69,13 @@ function Base.show(io::IO, tp::Tape)
 end
 
 function (instr::Instruction{F})() where F
-    output = instr.fun(map(val, instr.input)...)
+    try 
+        output = instr.fun(map(Libtask.val, instr.input)...)
     instr.output.val = output
+    catch e
+        println(e, catch_backtrace()); 
+        rethrow(e);
+    end
 end
 
 function increase_counter!(t::Tape)

From e4838e9d0e7c206bb668f453266d790113f37d9a Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Tue, 18 Jan 2022 03:18:43 +0000
Subject: [PATCH 12/18] put `new` onto tape

---
 perf/Project.toml    |  4 +--
 perf/p2.jl           | 63 ++++++++++++++++++++++++++++++++++++++++++++
 src/tapedfunction.jl | 54 ++++++++++++++++++++++++++++++++-----
 3 files changed, 111 insertions(+), 10 deletions(-)
 create mode 100644 perf/p2.jl

diff --git a/perf/Project.toml b/perf/Project.toml
index 829ff90d..9e9ab49b 100644
--- a/perf/Project.toml
+++ b/perf/Project.toml
@@ -4,13 +4,11 @@ AdvancedPS = "576499cb-2369-40b2-a588-c64705576edc"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
 Libtask = "6f1fad26-d15e-5dc8-ae53-837a1d7b8c9f"
-Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
 
 [compat]
 julia = "1.3"
 
-[extras]
-
 [targets]
 test = ["Test", "BenchmarkTools"]
diff --git a/perf/p2.jl b/perf/p2.jl
new file mode 100644
index 00000000..44fd61a7
--- /dev/null
+++ b/perf/p2.jl
@@ -0,0 +1,63 @@
+using Turing, Test, AbstractMCMC, DynamicPPL, Random, Turing.RandomMeasures, Libtask
+
+@model infiniteGMM(x) = begin
+    # Hyper-parameters, i.e. concentration parameter and parameters of H.
+    α = 1.0
+    μ0 = 0.0
+    σ0 = 1.0
+
+    # Define random measure, e.g. Dirichlet process.
+    rpm = DirichletProcess(α)
+
+    # Define the base distribution, i.e. expected value of the Dirichlet process.
+    H = Normal(μ0, σ0)
+
+    # Latent assignment.
+    z = tzeros(Int, length(x))
+
+    # Locations of the infinitely many clusters.
+    μ = tzeros(Float64, 0)
+
+    for i in 1:length(x)
+
+        # Number of clusters.
+        K = maximum(z)
+        nk = Vector{Int}(map(k -> sum(z .== k), 1:K))
+
+        # Draw the latent assignment.
+        z[i] ~ ChineseRestaurantProcess(rpm, nk)
+
+        # Create a new cluster?
+        if z[i] > K
+            push!(μ, 0.0)
+
+            # Draw location of new cluster.
+            μ[z[i]] ~ H
+        end
+
+        # Draw observation.
+        x[i] ~ Normal(μ[z[i]], 1.0)
+    end
+end
+
+# Generate some test data.
+Random.seed!(1)
+
+data = vcat(randn(10), randn(10) .- 5, randn(10) .+ 10)
+data .-= mean(data)
+data /= std(data)
+
+# MCMC sampling
+Random.seed!(2)
+iterations = 500
+model_fun = infiniteGMM(data)
+
+m = Turing.Core.TracedModel(model_fun, Sampler(SMC(50)), VarInfo())
+f = m.evaluator[1]
+args = m.evaluator[2:end]
+
+t = Libtask.CTask(f, args...)
+
+Libtask.step_in(t.tf.tape, args)
+
+@show Libtask.result(t.tf.tape)
diff --git a/src/tapedfunction.jl b/src/tapedfunction.jl
index d213b0af..ed52d9be 100644
--- a/src/tapedfunction.jl
+++ b/src/tapedfunction.jl
@@ -13,6 +13,12 @@ mutable struct Instruction{F} <: AbstractInstruction
     tape::Tape
 end
 
+mutable struct NewInstruction <: AbstractInstruction
+    input::Tuple
+    output
+    tape::Tape
+end
+
 Tape() = Tape(Vector{AbstractInstruction}(), 1, nothing)
 Tape(owner) = Tape(Vector{AbstractInstruction}(), 1, owner)
 MacroTools.@forward Tape.tape Base.iterate, Base.length
@@ -46,6 +52,10 @@ function Base.show(io::IO, box::Box)
     println(io, "Box($(box.val))")
 end
 
+function Base.show(io::IO, instruction::AbstractInstruction)
+    println(io, "A $(typeof(instruction))")
+end
+
 function Base.show(io::IO, instruction::Instruction)
     fun = instruction.fun
     tape = instruction.tape
@@ -69,15 +79,28 @@ function Base.show(io::IO, tp::Tape)
 end
 
 function (instr::Instruction{F})() where F
-    try 
-        output = instr.fun(map(Libtask.val, instr.input)...)
-    instr.output.val = output
+    try
+        output = instr.fun(map(val, instr.input)...)
+        instr.output.val = output
+    catch e
+        println(e, catch_backtrace());
+        rethrow(e);
+    end
+end
+
+
+function (instr::NewInstruction)()
+    try
+        expr = Expr(:new, map(val, instr.input)...)
+        output = eval(expr)
+        instr.output.val = output
     catch e
-        println(e, catch_backtrace()); 
+        println(e, catch_backtrace());
         rethrow(e);
     end
 end
 
+
 function increase_counter!(t::Tape)
     t.counter > length(t) && return
     # instr = t[t.counter]
@@ -109,6 +132,19 @@ function run_and_record!(tape::Tape, f, args...)
     return output
 end
 
+function run_and_record_new!(tape::Tape, args...)
+    output = try
+        expr = Expr(:new, map(val, args)...)
+        box(eval(expr))
+    catch e
+        @warn e
+        Box{Any}(nothing)
+    end
+    ins = NewInstruction(args, output, tape)
+    push!(tape, ins)
+    return output
+end
+
 function unbox_condition(ir)
     for blk in IRTools.blocks(ir)
         vars = keys(blk)
@@ -177,9 +213,13 @@ function intercept(ir; recorder=:run_and_record!)
 
     for (x, st) in ir
         x == tape && continue
-        Meta.isexpr(st.expr, :call) || continue
-        new_args = (x == args_var) ? st.expr.args : _replace_args(st.expr.args, arg_pairs)
-        ir[x] = IRTools.xcall(@__MODULE__, recorder, tape, new_args...)
+        if Meta.isexpr(st.expr, :call)
+            new_args = (x == args_var) ? st.expr.args : _replace_args(st.expr.args, arg_pairs)
+            ir[x] = IRTools.xcall(@__MODULE__, recorder, tape, new_args...)
+        elseif Meta.isexpr(st.expr, :new)
+            args = st.expr.args
+            ir[x] = IRTools.xcall(@__MODULE__, :run_and_record_new!, tape, args...)
+        end
     end
     # the real return value will be in the last instruction on the tape
     IRTools.return!(ir, tape)

From e1ae835bed9bd7e0c8b20c17f4dbaa580fc60999 Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Tue, 18 Jan 2022 05:54:42 +0000
Subject: [PATCH 13/18] copy NewInstruction

---
 src/tapedtask.jl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/tapedtask.jl b/src/tapedtask.jl
index 72af7d2e..75861cea 100644
--- a/src/tapedtask.jl
+++ b/src/tapedtask.jl
@@ -212,6 +212,14 @@ function Base.copy(x::Instruction, on_tape::Tape, roster::Dict{UInt64, Any})
     Instruction(x.fun, input, output, on_tape)
 end
 
+function Base.copy(x::NewInstruction, on_tape::Tape, roster::Dict{UInt64, Any})
+    input = map(x.input) do ob
+        copy_box(ob, roster)
+    end
+    output = copy_box(x.output, roster)
+    NewInstruction(input, output, on_tape)
+end
+
 function Base.copy(t::Tape, roster::Dict{UInt64, Any}; partial=true)
     old_data = t.tape
     len = partial ? length(old_data) - t.counter + 1 : length(old_data)

From c9673cd11255b54f311b0354cae11db78a66db36 Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Tue, 18 Jan 2022 08:41:44 +0000
Subject: [PATCH 14/18] update docs/comments

---
 src/tapedfunction.jl | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/tapedfunction.jl b/src/tapedfunction.jl
index ed52d9be..ceece830 100644
--- a/src/tapedfunction.jl
+++ b/src/tapedfunction.jl
@@ -6,6 +6,11 @@ mutable struct Tape
     owner
 end
 
+"""
+    Instruction
+
+An `Instruction` stands for a function call
+"""
 mutable struct Instruction{F} <: AbstractInstruction
     fun::F
     input::Tuple
@@ -13,6 +18,14 @@ mutable struct Instruction{F} <: AbstractInstruction
     tape::Tape
 end
 
+
+"""
+    NewInstruction
+
+A `NewInstruction` stands for a `new` operator, which only appears in
+an inner constructor. Its represtation in IRCode is not a function call,
+so we need a new intruction type to represent it on tapes.
+"""
 mutable struct NewInstruction <: AbstractInstruction
     input::Tuple
     output
@@ -79,6 +92,7 @@ function Base.show(io::IO, tp::Tape)
 end
 
 function (instr::Instruction{F})() where F
+    # Catch run-time exceptions / errors.
     try
         output = instr.fun(map(val, instr.input)...)
         instr.output.val = output
@@ -90,6 +104,7 @@ end
 
 
 function (instr::NewInstruction)()
+    # Catch run-time exceptions / errors.
     try
         expr = Expr(:new, map(val, instr.input)...)
         output = eval(expr)

From d14936ba07e649ead60d5b09578b744bf191c63b Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Tue, 18 Jan 2022 13:56:32 +0000
Subject: [PATCH 15/18] give a warning when find an unknown ir code

---
 src/tapedfunction.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/tapedfunction.jl b/src/tapedfunction.jl
index ceece830..b3339cd8 100644
--- a/src/tapedfunction.jl
+++ b/src/tapedfunction.jl
@@ -234,6 +234,8 @@ function intercept(ir; recorder=:run_and_record!)
         elseif Meta.isexpr(st.expr, :new)
             args = st.expr.args
             ir[x] = IRTools.xcall(@__MODULE__, :run_and_record_new!, tape, args...)
+        else
+            @warn "Unknown IR code: " st
         end
     end
     # the real return value will be in the last instruction on the tape

From 000ef2bcb58de3acf8f203d04b1dc4991f3899ca Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Tue, 18 Jan 2022 15:20:25 +0000
Subject: [PATCH 16/18] refactor new instruction, add test cases

---
 src/tapedfunction.jl | 28 +++++++---------------------
 src/tapedtask.jl     |  8 --------
 test/runtests.jl     |  1 +
 test/tf.jl           | 17 +++++++++++++++++
 4 files changed, 25 insertions(+), 29 deletions(-)
 create mode 100644 test/tf.jl

diff --git a/src/tapedfunction.jl b/src/tapedfunction.jl
index b3339cd8..10d27dc3 100644
--- a/src/tapedfunction.jl
+++ b/src/tapedfunction.jl
@@ -18,20 +18,6 @@ mutable struct Instruction{F} <: AbstractInstruction
     tape::Tape
 end
 
-
-"""
-    NewInstruction
-
-A `NewInstruction` stands for a `new` operator, which only appears in
-an inner constructor. Its represtation in IRCode is not a function call,
-so we need a new intruction type to represent it on tapes.
-"""
-mutable struct NewInstruction <: AbstractInstruction
-    input::Tuple
-    output
-    tape::Tape
-end
-
 Tape() = Tape(Vector{AbstractInstruction}(), 1, nothing)
 Tape(owner) = Tape(Vector{AbstractInstruction}(), 1, owner)
 MacroTools.@forward Tape.tape Base.iterate, Base.length
@@ -92,7 +78,7 @@ function Base.show(io::IO, tp::Tape)
 end
 
 function (instr::Instruction{F})() where F
-    # Catch run-time exceptions / errors.
+    # catch run-time exceptions / errors.
     try
         output = instr.fun(map(val, instr.input)...)
         instr.output.val = output
@@ -102,9 +88,9 @@ function (instr::Instruction{F})() where F
     end
 end
 
-
-function (instr::NewInstruction)()
-    # Catch run-time exceptions / errors.
+function _new end
+function (instr::Instruction{typeof(_new)})()
+    # catch run-time exceptions / errors.
     try
         expr = Expr(:new, map(val, instr.input)...)
         output = eval(expr)
@@ -147,7 +133,7 @@ function run_and_record!(tape::Tape, f, args...)
     return output
 end
 
-function run_and_record_new!(tape::Tape, args...)
+function run_and_record!(tape::Tape, ::typeof(_new), args...)
     output = try
         expr = Expr(:new, map(val, args)...)
         box(eval(expr))
@@ -155,7 +141,7 @@ function run_and_record_new!(tape::Tape, args...)
         @warn e
         Box{Any}(nothing)
     end
-    ins = NewInstruction(args, output, tape)
+    ins = Instruction(_new, args, output, tape)
     push!(tape, ins)
     return output
 end
@@ -233,7 +219,7 @@ function intercept(ir; recorder=:run_and_record!)
             ir[x] = IRTools.xcall(@__MODULE__, recorder, tape, new_args...)
         elseif Meta.isexpr(st.expr, :new)
             args = st.expr.args
-            ir[x] = IRTools.xcall(@__MODULE__, :run_and_record_new!, tape, args...)
+            ir[x] = IRTools.xcall(@__MODULE__, recorder, tape, _new, args...)
         else
             @warn "Unknown IR code: " st
         end
diff --git a/src/tapedtask.jl b/src/tapedtask.jl
index 75861cea..72af7d2e 100644
--- a/src/tapedtask.jl
+++ b/src/tapedtask.jl
@@ -212,14 +212,6 @@ function Base.copy(x::Instruction, on_tape::Tape, roster::Dict{UInt64, Any})
     Instruction(x.fun, input, output, on_tape)
 end
 
-function Base.copy(x::NewInstruction, on_tape::Tape, roster::Dict{UInt64, Any})
-    input = map(x.input) do ob
-        copy_box(ob, roster)
-    end
-    output = copy_box(x.output, roster)
-    NewInstruction(input, output, on_tape)
-end
-
 function Base.copy(t::Tape, roster::Dict{UInt64, Any}; partial=true)
     old_data = t.tape
     len = partial ? length(old_data) - t.counter + 1 : length(old_data)
diff --git a/test/runtests.jl b/test/runtests.jl
index 2749827a..24c28f6e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,7 @@
 using Libtask
 using Test
 
+include("tf.jl")
 include("ctask.jl")
 include("tarray.jl")
 include("tref.jl")
diff --git a/test/tf.jl b/test/tf.jl
new file mode 100644
index 00000000..53ac57ca
--- /dev/null
+++ b/test/tf.jl
@@ -0,0 +1,17 @@
+using Libtask
+
+@testset "tapedfunction" begin
+        # Test case 1: `new` in an inner constructor is taped as one `_new` instruction.
+    @testset "Instruction{typeof(_new)}" begin
+        mutable struct S
+            i::Int
+            S(x, y) = new(x + y)
+        end
+
+        tf = Libtask.TapedFunction(S)
+        s1 = tf(1, 2)
+        @test s1.i == 3
+        newins = findall(x -> isa(x, Libtask.Instruction{typeof(Libtask._new)}), tf.tape.tape)
+        @test length(newins) == 1
+    end
+end

From 1edd7189f9981a8e109f8a553a0119509579e68e Mon Sep 17 00:00:00 2001
From: KDr2 <zhuo@hexoasis.com>
Date: Wed, 19 Jan 2022 00:31:55 +0000
Subject: [PATCH 17/18] new CI job

---
 .../BenchmarksAndMicroIntegration.yml         | 40 +++++++++++++++++++
 perf/runtests.jl                              |  3 ++
 2 files changed, 43 insertions(+)
 create mode 100644 .github/workflows/BenchmarksAndMicroIntegration.yml
 create mode 100644 perf/runtests.jl

diff --git a/.github/workflows/BenchmarksAndMicroIntegration.yml b/.github/workflows/BenchmarksAndMicroIntegration.yml
new file mode 100644
index 00000000..67b8b421
--- /dev/null
+++ b/.github/workflows/BenchmarksAndMicroIntegration.yml
@@ -0,0 +1,40 @@
+name: Benchmarks and MicroIntegration
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+
+jobs:
+  test:
+    name: Benchmarks and MicroIntegration
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v2
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: 1
+          arch: x64
+      - uses: julia-actions/julia-buildpkg@latest
+      - name: setup environment
+        shell: julia --color=yes --project=perf {0}
+        run: |
+          using Pkg
+          try
+            # force it to use this PR's version of the package
+            pkg"add Turing#hg/new-libtask2" # TODO: remove this when Turing is updated
+            Pkg.develop(PackageSpec(path="."))  # resolver may fail with main deps
+            Pkg.update()
+          catch err
+            err isa Pkg.Resolve.ResolverError || rethrow()
+            # If the resolver fails, this version is incompatible by SemVer, which is fine:
+            # it means we marked this as a breaking change, so we don't need to worry about
+            # mistakenly introducing a breaking change, as we have intentionally made one.
+            @info "Not compatible with this release. No problem." exception=err
+            exit(0)  # Exit immediately, as a success
+          end
+      - name: run
+        run: julia --color=yes --project=perf perf/runtests.jl
diff --git a/perf/runtests.jl b/perf/runtests.jl
new file mode 100644
index 00000000..9856db08
--- /dev/null
+++ b/perf/runtests.jl
@@ -0,0 +1,3 @@
+include("p0.jl")
+include("p1.jl")
+include("p2.jl")

From 3b4fb2b7be30cb44dd959c1faeae54ad94cc07b8 Mon Sep 17 00:00:00 2001
From: Hong Ge <hg344@cam.ac.uk>
Date: Wed, 19 Jan 2022 09:40:52 +0000
Subject: [PATCH 18/18] Update Project.toml

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 1de90e38..544296e8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,7 +3,7 @@ uuid = "6f1fad26-d15e-5dc8-ae53-837a1d7b8c9f"
 license = "MIT"
 desc = "Tape based task copying in Turing"
 repo = "https://github.com/TuringLang/Libtask.jl.git"
-version = "0.6.2"
+version = "0.6.3"
 
 [deps]
 IRTools = "7869d1d1-7146-5819-86e3-90919afe41df"