Skip to content

Commit

Permalink
Merge pull request #361 from SciML/metal
Browse files Browse the repository at this point in the history
Setup Accelerate and MKL for 32-bit, MKL getrf, and Metal.jl integration
  • Loading branch information
ChrisRackauckas committed Aug 10, 2023
2 parents 9224384 + c3a93d9 commit 3881230
Show file tree
Hide file tree
Showing 11 changed files with 375 additions and 9 deletions.
7 changes: 5 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,19 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"

[extensions]
LinearSolveCUDAExt = "CUDA"
LinearSolveHYPREExt = "HYPRE"
LinearSolveIterativeSolversExt = "IterativeSolvers"
LinearSolveMKLExt = "MKL_jll"
LinearSolveKrylovKitExt = "KrylovKit"
LinearSolveMKLExt = "MKL_jll"
LinearSolveMetalExt = "Metal"
LinearSolvePardisoExt = "Pardiso"

[compat]
Expand Down Expand Up @@ -74,6 +76,7 @@ IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
MultiFloats = "bdf0d083-296b-4888-a5b6-7498122e68a5"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Expand Down
51 changes: 51 additions & 0 deletions benchmarks/applelu.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, Metal
nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

function luflop(m, n = m; innerflop = 2)
sum(1:min(m, n)) do k
invflop = 1
scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
updateflop = isempty((k + 1):n) ? 0 :
sum((k + 1):n) do j
isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
innerflop
end
end
invflop + scaleflop + updateflop
end
end

algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), AppleAccelerateLUFactorization(), MetalLUFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 4:8:500
for i in 1:length(ns)
n = ns[i]
@info "$n × $n"
rng = MersenneTwister(123)
global A = rand(rng, Float32, n, n)
global b = rand(rng, Float32, n)
global u0= rand(rng, Float32, n)

for j in 1:length(algs)
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
push!(res[j], luflop(n) / bt / 1e9)
end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
for i in 2:length(res)
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metallubench.png")
savefig("metallubench.pdf")
51 changes: 51 additions & 0 deletions benchmarks/cudalu.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, CUDA, MKL_jll
nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

function luflop(m, n = m; innerflop = 2)
sum(1:min(m, n)) do k
invflop = 1
scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
updateflop = isempty((k + 1):n) ? 0 :
sum((k + 1):n) do j
isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
innerflop
end
end
invflop + scaleflop + updateflop
end
end

algs = [MKLLUFactorization(), CudaOffloadFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 200:400:10000
for i in 1:length(ns)
n = ns[i]
@info "$n × $n"
rng = MersenneTwister(123)
global A = rand(rng, Float32, n, n)
global b = rand(rng, Float32, n)
global u0= rand(rng, Float32, n)

for j in 1:length(algs)
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
push!(res[j], luflop(n) / bt / 1e9)
end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
for i in 2:length(res)
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("cudaoffloadlubench.png")
savefig("cudaoffloadlubench.pdf")
52 changes: 52 additions & 0 deletions benchmarks/metallu.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, Metal
nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

function luflop(m, n = m; innerflop = 2)
sum(1:min(m, n)) do k
invflop = 1
scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
updateflop = isempty((k + 1):n) ? 0 :
sum((k + 1):n) do j
isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
innerflop
end
end
invflop + scaleflop + updateflop
end
end

algs = [AppleAccelerateLUFactorization(), MetalLUFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 200:600:15000
for i in 1:length(ns)
n = ns[i]
@info "$n × $n"
rng = MersenneTwister(123)
global A = rand(rng, Float32, n, n)
global b = rand(rng, Float32, n)
global u0= rand(rng, Float32, n)

for j in 1:length(algs)
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
GC.gc()
push!(res[j], luflop(n) / bt / 1e9)
end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
for i in 2:length(res)
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metal_large_lubench.png")
savefig("metal_large_lubench.pdf")
65 changes: 60 additions & 5 deletions docs/src/solvers/solvers.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,37 @@ Solves for ``Au=b`` in the problem defined by `prob` using the algorithm

## Recommended Methods

### Dense Matrices

The default algorithm `nothing` is good for picking an algorithm that will work,
but one may need to change this to receive more performance or precision. If
more precision is necessary, `QRFactorization()` and `SVDFactorization()` are
the best choices, with SVD being the slowest but most precise.

For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations.
`FastLUFactorization` will be faster than `LUFactorization` which is the Base.LinearAlgebra
(`\` default) implementation of LU factorization. `SimpleLUFactorization` will be fast
on very small matrices.
For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around
150x150 matrices, though this can be dependent on the exact details of the hardware. After this
point, `MKLLUFactorization` is usually faster on most hardware. Note that on Mac computers
that `AppleAccelerateLUFactorization` is generally always the fastest. `LUFactorization` will
use your base system BLAS which can be fast or slow depending on the hardware configuration.
`SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times.

For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used
on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000
matrices (and only supports Float32). `CudaOffloadFactorization` can be more efficient at a
much smaller cutoff, possibly around size 1,000 x 1,000 matrices, though this is highly dependent
on the chosen GPU hardware. `CudaOffloadFactorization` requires a CUDA-compatible NVIDIA GPU.
CUDA offload supports Float64 but most consumer GPU hardware will be much faster on Float32
(many are >32x faster for Float32 operations than Float64 operations) and thus for most hardware
this is only recommended for Float32 matrices.

!!! note

Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
For details see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).
If one is looking to best optimize their system, we suggest running the performance
tuning benchmark.

### Sparse Matrices

For sparse LU-factorizations, `KLUFactorization` if there is less structure
to the sparsity pattern and `UMFPACKFactorization` if there is more structure.
Expand All @@ -31,12 +53,25 @@ As sparse matrices get larger, iterative solvers tend to get more efficient than
factorization methods if a lower tolerance of the solution is required.

Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
with CPUs and GPUs, and thus is the generally preferred form for Krylov methods.
with CPUs and GPUs, and thus is the generally preferred form for Krylov methods. The
choice of Krylov method should be the one most constrained to the type of operator one
has, for example if positive definite then `Krylov_CG()`, but if no good properties then
use `Krylov_GMRES()`.

Finally, a user can pass a custom function for handling the linear solve using
`LinearSolveFunction()` if existing solvers are not optimally suited for their application.
The interface is detailed [here](@ref custom).

### Lazy SciMLOperators

If the linear operator is given as a lazy non-concrete operator, such as a `FunctionOperator`,
then using a Krylov method is preferred in order to not concretize the matrix.
Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
with CPUs and GPUs, and thus is the generally preferred form for Krylov methods. The
choice of Krylov method should be the one most constrained to the type of operator one
has, for example if positive definite then `Krylov_CG()`, but if no good properties then
use `Krylov_GMRES()`.

## Full List of Methods

### RecursiveFactorization.jl
Expand Down Expand Up @@ -121,6 +156,26 @@ KrylovJL
MKLLUFactorization
```

### AppleAccelerate.jl

!!! note

Using this solver requires a Mac with Apple Accelerate. This should come standard in most "modern" Mac computers.

```@docs
AppleAccelerateLUFactorization
```

### Metal.jl

!!! note

Using this solver requires adding the package Metal.jl, i.e. `using Metal`. This package is only compatible with Mac M-Series computers with a Metal-compatible GPU.

```@docs
MetalLUFactorization
```

### Pardiso.jl

!!! note
Expand Down
74 changes: 74 additions & 0 deletions ext/LinearSolveMKLExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,63 @@ function getrf!(A::AbstractMatrix{<:Float64}; ipiv = similar(A, BlasInt, min(siz
A, ipiv, info[], info #Error code is stored in LU factorization type
end

function getrf!(A::AbstractMatrix{<:Float32}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
require_one_based_indexing(A)
check && chkfinite(A)
chkstride1(A)
m, n = size(A)
lda = max(1,stride(A, 2))
if isempty(ipiv)
ipiv = similar(A, BlasInt, min(size(A,1),size(A,2)))
end
ccall((@blasfunc(sgetrf_), MKL_jll.libmkl_rt), Cvoid,
(Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32},
Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
m, n, A, lda, ipiv, info)
chkargsok(info[])
A, ipiv, info[], info #Error code is stored in LU factorization type
end

function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float64}; info = Ref{BlasInt}())
require_one_based_indexing(A, ipiv, B)
LinearAlgebra.LAPACK.chktrans(trans)
chkstride1(A, B, ipiv)
n = LinearAlgebra.checksquare(A)
if n != size(B, 1)
throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
end
if n != length(ipiv)
throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
end
nrhs = size(B, 2)
ccall(("dgetrs_", MKL_jll.libmkl_rt), Cvoid,
(Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt},
Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
B
end

function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float32}; info = Ref{BlasInt}())
require_one_based_indexing(A, ipiv, B)
LinearAlgebra.LAPACK.chktrans(trans)
chkstride1(A, B, ipiv)
n = LinearAlgebra.checksquare(A)
if n != size(B, 1)
throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
end
if n != length(ipiv)
throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
end
nrhs = size(B, 2)
ccall(("sgetrs_", MKL_jll.libmkl_rt), Cvoid,
(Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt},
Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
B
end

default_alias_A(::MKLLUFactorization, ::Any, ::Any) = false
default_alias_b(::MKLLUFactorization, ::Any, ::Any) = false

Expand All @@ -47,8 +104,25 @@ function SciMLBase.solve!(cache::LinearCache, alg::MKLLUFactorization;
cache.cacheval = fact
cache.isfresh = false
end

y = ldiv!(cache.u, @get_cacheval(cache, :MKLLUFactorization)[1], cache.b)
SciMLBase.build_linear_solution(alg, y, nothing, cache)

#=
A, info = @get_cacheval(cache, :MKLLUFactorization)
LinearAlgebra.require_one_based_indexing(cache.u, cache.b)
m, n = size(A, 1), size(A, 2)
if m > n
Bc = copy(cache.b)
getrs!('N', A.factors, A.ipiv, Bc; info)
return copyto!(cache.u, 1, Bc, 1, n)
else
copyto!(cache.u, cache.b)
getrs!('N', A.factors, A.ipiv, cache.u; info)
end
SciMLBase.build_linear_solution(alg, cache.u, nothing, cache)
=#
end

end
Loading

0 comments on commit 3881230

Please sign in to comment.