Merge pull request #72 from PALEOtoolkit/simd_iteratorutils

Update doc for SIMDutils, IteratorUtils
PALEOtoolkit · Apr 20, 2023 · 0c89818 · 0c89818
2 parents 28affad + 6a2a2f0
commit 0c89818
Show file tree

Hide file tree

Showing 2 changed files with 113 additions and 24 deletions.
diff --git a/src/utils/IteratorUtils.jl b/src/utils/IteratorUtils.jl
@@ -49,8 +49,9 @@ Call `f(t1[n], t2[n])` for each element `n` of `t1::Tuple`, `t2::Tuple`.
 # Implementation
 Recursively generates inline code for speed and type stability.
 
-NB: as of Julia 1.6, slows down and allocates if `length(t1) > 32` due to Julia limitation.
-See [`foreach_longtuple`](@ref) for this.
+# Limitations
+As of Julia 1.6, slows down and allocates if `length(t1) > 32` due to Julia limitation.
+See [`foreach_longtuple`](@ref).
 
 See https://github.com/JuliaLang/julia/issues/31869
 https://github.com/JuliaLang/julia/blob/master/base/tuple.jl (map implementation)
@@ -77,6 +78,62 @@ foreach_tuple_unchecked(f, t1::Tuple, t2::Tuple) =
 Call `f(t1[n], t2[n], ... tm[n])` or `f(t1[n], t2[n], ... tm[n], p)` for each element
 `n` of `t1::Tuple`, `t2`, ... `tm`.
 
+# Examples
+    Here `input_concs_cell` and `input_concs` are `NamedTuple`s.
+
+    PB.IteratorUtils.foreach_longtuple(values(input_concs_cell), values(input_concs)) do ic_cell, ic
+       ic_cell[] = r_rhofac*ic[i]  # generates a closure with r_rhofac as a captured variable
+    end
+
+
+    # closure with r_rhofac as a captured variable
+    function f(ic_cell, ic)
+        ic_cell[] = r_rhofac*ic[i]  
+    end
+    PB.IteratorUtils.foreach_longtuple(f, values(input_concs_cell), values(input_concs))
+
+
+    PB.IteratorUtils.foreach_longtuple_p(values(input_concs_cell), values(input_concs), r_rhofac) do ic_cell, ic, _r_rhofac
+        ic_cell[] = _r_rhofac*ic[i]  # no closure with _r_rhofac as an argument
+    end
+
+
+    # no closure with _r_rhofac as an argument
+    function f(ic_cell, ic, _r_rhofac)
+        ic_cell[] = _r_rhofac*ic[i]  
+    end
+    PB.IteratorUtils.foreach_longtuple_p(f, values(input_concs_cell), values(input_concs), r_rhofac)
+
+
+# Limitations
+See Julia performance tips:
+    - There are potential problems with captured variables when using a closure as `f`, either explicitly or using do block syntax. 
+    It is usually safer to explicitly write out `f` with all arguments, and pass them in using `foreach_longtuple_p`, using `p::Tuple` for multiple arguments.
+    - Passing type parameters to `f` is prone to creating additional problems if `f` is not specialized, which happens:
+      - if `f` is either an explicit closure or is a closure created using do block syntax
+      - if a type parameter is passed as a member of `p::Tuple`
+      - if a type parameter is passed as `p`, but `f` is not explicitly specialized using `p::Type{MyType}` and `f` is not inlined (do block syntax does appear to work) 
+    It is safest to either explicitly specialize `f` on a type parameter using `p::Type{MyType}`, or avoid passing type parameters and eg use eltype to derive them.
+
+    # Allocates: generates a closure with BufType as a captured variable, with no specialization
+    PB.IteratorUtils.foreach_longtuple(values(input_concs_cell), values(input_concs)) do ic_cell, ic
+        ic_cell[] = convert(BufType, ic[i])  # allocates
+    end
+
+
+    # OK: no closure, with explicit specialization on BufType argument
+    function f(ic_cell, ic, ::Type{BufType}) where {BufType} # force specialization
+        ic_cell[] = convert(BufType, ic[i])  
+    end
+    PB.IteratorUtils.foreach_longtuple_p(f, values(input_concs_cell), values(input_concs), BufType)
+
+
+    # Allocates: no closure, but BufType is a Tuple member so is passed as a `DataType` hence no specialization
+    function f(ic_cell, ic, (BufType, r_rhofac))
+        ic_cell[] = r_rhofac*convert(BufType, ic[i])  
+    end
+    PB.IteratorUtils.foreach_longtuple_p(f, values(input_concs_cell), values(input_concs), (r_rhofac, BufType))
+
 # Implementation
 Uses `@generated` to generate unrolled code for speed and type stability without length restrictions on `tm`.
 
@@ -148,7 +205,7 @@ end
 end
 
 @inline foreach_longtuple_p(f::F, t1, t2, p; errmsg="iterables lengths differ") where{F} =
-    foreach_longtuple_unchecked_p(f, t1, t2, p)
+    (check_lengths_equal(t1, t2; errmsg=errmsg); foreach_longtuple_unchecked_p(f, t1, t2, p))
 @generated function foreach_longtuple_unchecked_p(f, t1::Tuple, t2, p)
     ex = quote ; end  # empty expression
     for j=1:fieldcount(t1)
@@ -160,7 +217,7 @@ end
 end
 
 @inline foreach_longtuple_p(f::F, t1, t2, t3, p; errmsg="iterables lengths differ") where{F} =
-    foreach_longtuple_unchecked_p(f, t1, t2, t3, p)
+    (check_lengths_equal(t1, t2, t3; errmsg=errmsg); foreach_longtuple_unchecked_p(f, t1, t2, t3, p))
 @generated function foreach_longtuple_unchecked_p(f, t1::Tuple, t2, t3, p)
     ex = quote ; end  # empty expression
     for j=1:fieldcount(t1)
@@ -172,7 +229,7 @@ end
 end
 
 @inline foreach_longtuple_p(f::F, t1, t2, t3, t4, p; errmsg="iterables lengths differ") where{F} =
-    foreach_longtuple_unchecked_p(f, t1, t2, t3, t4, p)
+    (check_lengths_equal(t1, t2, t3, t4; errmsg=errmsg); foreach_longtuple_unchecked_p(f, t1, t2, t3, t4, p))
 @generated function foreach_longtuple_unchecked_p(f, t1::Tuple, t2, t3, t4, p)
     ex = quote ; end  # empty expression
     for j=1:fieldcount(t1)
@@ -184,7 +241,7 @@ end
 end
 
 @inline foreach_longtuple_p(f::F, t1, t2, t3, t4, t5, p; errmsg="iterables lengths differ") where{F} =
-    foreach_longtuple_unchecked_p(f, t1, t2, t3, t4, t5, p)
+    (check_lengths_equal(t1, t2, t3, t4, t5; errmsg=errmsg); foreach_longtuple_unchecked_p(f, t1, t2, t3, t4, t5, p))
 @generated function foreach_longtuple_unchecked_p(f, t1::Tuple, t2, t3, t4, t5, p)
     ex = quote ; end  # empty expression
     for j=1:fieldcount(t1)

diff --git a/src/utils/SIMDutils.jl b/src/utils/SIMDutils.jl
@@ -36,7 +36,7 @@ indices into a Vector). See Julia package [SIMD.jl](https://github.com/eschnett/
 If `baseiter` contained 1 or more but less then `N` elements, then `indices` is filled with
 repeats of the last available element.
 
-Returns Tuple `indices` (length `N`).
+Returns Tuple of indices (length `N`).
 
 # Examples
 
@@ -48,22 +48,22 @@ Returns Tuple `indices` (length `N`).
 
     # simplest version - Float64 x 4, ie type of v_a x 4
  
-    for indvec in SIMDIter(iter, Val(4))
-        x = v_a[indvec]  # x is a packed SIMD vector
-        v_b[indvec] = x
+    for i in SIMDIter(iter, Val(4))
+        x = v_a[i]  # x is a packed SIMD vector
+        v_b[i] = x
     end
 
 
     # with type conversion - Float32 x 8, ie explicitly change Type of SIMD vector
 
     ST = SIMD.Vec{8, Float32}}    
-    for indvec in SIMDIter(iter, ST)
-        #   v = vec[indvec]  <--> vgatherind(ST, vec, indvec)
-        #   vec[indvec] = v  <--> vscatterind!(v, vec, indvec)
-        #   vec[indvec] += v <--> vaddind!(v, vec, indvec) 
+    for i in SIMDIter(iter, ST)
+        #   v = vec[i]  <--> vgatherind(ST, vec, i)
+        #   vec[i] = v  <--> vscatterind!(v, vec, i)
+        #   vec[i] += v <--> vaddind!(v, vec, i) 
 
-        x = vgatherind(ST, v_a, indices)  # x is a packed SIMD vector with type conversion to Float32
-        vscatterind!(x, v_b, indices)
+        x = vgatherind(ST, v_a, i)  # x is a packed SIMD vector with type conversion to Float32
+        vscatterind!(x, v_b, i)
     end
 
 """
@@ -145,7 +145,14 @@ end
 # Fill SIMD type from scalar Vector
 #####################################
 
-"scalar fallback - just element in a vector"
+"""
+    vgatherind(vec, indices::Integer, mask) -> v # v[i], scalar fallback
+    vgatherind(vec, indices::SIMD.Vec{N, <:Integer}, mask) -> v::SIMD.Vec{N, eltype(vec)}
+
+Fill SIMD type from scalar Vector, without type conversion
+
+See [`SIMDIter`](@ref) for example usage.
+"""
 @Base.propagate_inbounds function vgatherind(vec, indices::Integer, mask)
     return vec[indices]
 end
@@ -154,22 +161,28 @@ end
     return SIMD.vgather(vec, indices, mask)
 end
 
-"scalar fallback - just element in a vector"
+"""
+    vgatherind(::Type{T}, vec, indices::Integer, mask) -> v::T # v[i], scalar fallback
+    vgatherind(::Type{SIMD.Vec{N, T}}, vec, indices::SIMD.Vec{N, <:Integer}) -> v::SIMD.Vec{N, T}
+
+Fill SIMD type from scalar Vector, with explicit type conversion to scalar type `T`
+
+NB: assumes all `indices` are valid
+
+See [`SIMDIter`](@ref) for example usage.
+"""
 @Base.propagate_inbounds function vgatherind(::Type{T}, vec, indices::Integer) where {T}
     return convert(T, vec[indices])
 end
 
-"gather with type conversion, assumes indices all valid"
 @Base.propagate_inbounds function vgatherind(::Type{SIMD.Vec{2, T}}, vec, indices::SIMD.Vec{2, <:Integer}) where T
     return SIMD.Vec{2, T}((vec[indices[1]], vec[indices[2]]))
 end
 
-"gather with type conversion, assumes indices all valid"
 @Base.propagate_inbounds function vgatherind(::Type{SIMD.Vec{4, T}}, vec, indices::SIMD.Vec{4, <:Integer}) where T
     return SIMD.Vec{4, T}((vec[indices[1]], vec[indices[2]], vec[indices[3]], vec[indices[4]]))
 end
 
-"gather with type conversion, assumes indices all valid"
 @Base.propagate_inbounds function vgatherind(::Type{SIMD.Vec{8, T}}, vec, indices::SIMD.Vec{8, <:Integer}) where T
     return SIMD.Vec{8, T}(
         ( 
@@ -183,13 +196,23 @@ end
 # Unpack SIMD type and write to scalar Vector
 #############################################
 
-"scalar fallback - just set element in a vector"
+"""
+    vscatterind!(v::T, vec::Array{T, N}, indices::Integer, mask) # scalar fallback, no type conversion
+    vscatterind!(v::T, vec::Array{T, N}, indices::Integer) # scalar fallback, no type conversion
+    vscatterind!(v::SIMD.Vec{N, T}, vec, indices::SIMD.Vec{N, <:Integer}, mask) # converts to eltype{vec}
+    vscatterind!(v::SIMD.Vec{N, T}, vec, indices::SIMD.Vec{N, <:Integer}) # converts to eltype{vec} 
+
+Unpack SIMD type and write to scalar Vector, with type conversion to `eltype(vec)`.
+
+Versions without `mask` assume all indices are valid (but may be repeated).
+
+See [`SIMDIter`](@ref) for example usage.
+"""
 @Base.propagate_inbounds function vscatterind!(v::T, vec::Array{T, N}, indices::Integer, mask) where {T,N}
     vec[indices] = v   
     return nothing
 end
 
-"scalar fallback - just set element in a vector"
 @Base.propagate_inbounds function vscatterind!(v::T, vec::Array{T,N}, indices::Integer) where {T,N}
     vec[indices] = v   
     return nothing
@@ -209,7 +232,16 @@ end
 # Unpack SIMD type and add to scalar Vector
 #############################################
 
-"scalar fallback - just add to element in a vector `vec`"
+"""
+    vaddind!(v::T,  vec::Array{T, N}, indices::Integer) # scalar fallback, no type conversion
+    vaddind!(v::SIMD.Vec{N, T}, vec, indices::SIMD.Vec{N, <:Integer})
+
+Unpack SIMD type and add to scalar Vector, with type conversion to `eltype(vec)`.
+
+Assumes all indices are valid (but may be repeated, which will still have the effect of one add).
+
+See [`SIMDIter`](@ref) for example usage.
+"""
 @Base.propagate_inbounds function vaddind!(v::T,  vec::Array{T, N}, indices::Integer) where {T, N}
     vec[indices] += v
     return nothing