From c0e1e13e735401dfc1338a6d4a19f18bce32dc5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Fri, 28 Nov 2025 12:04:57 +0100 Subject: [PATCH] Revert "Fix default algorithm for sparse CUDA matrices to LUFactorization" --- ext/LinearSolveCUDAExt.jl | 62 ++++++++++++-------------------------- ext/LinearSolveCUDSSExt.jl | 1 - 2 files changed, 19 insertions(+), 44 deletions(-) diff --git a/ext/LinearSolveCUDAExt.jl b/ext/LinearSolveCUDAExt.jl index e446094e1..58e2c3444 100644 --- a/ext/LinearSolveCUDAExt.jl +++ b/ext/LinearSolveCUDAExt.jl @@ -5,11 +5,9 @@ using LinearSolve: LinearSolve, is_cusparse, defaultalg, cudss_loaded, DefaultLi DefaultAlgorithmChoice, ALREADY_WARNED_CUDSS, LinearCache, needs_concrete_A, error_no_cudss_lu, init_cacheval, OperatorAssumptions, - CudaOffloadFactorization, CudaOffloadLUFactorization, - CudaOffloadQRFactorization, + CudaOffloadFactorization, CudaOffloadLUFactorization, CudaOffloadQRFactorization, CUDAOffload32MixedLUFactorization, - SparspakFactorization, KLUFactorization, UMFPACKFactorization, - LinearVerbosity + SparspakFactorization, KLUFactorization, UMFPACKFactorization, LinearVerbosity using LinearSolve.LinearAlgebra, LinearSolve.SciMLBase, LinearSolve.ArrayInterface using SciMLBase: AbstractSciMLOperator @@ -25,16 +23,11 @@ function LinearSolve.defaultalg(A::CUDA.CUSPARSE.CuSparseMatrixCSR{Tv, Ti}, b, if LinearSolve.cudss_loaded(A) LinearSolve.DefaultLinearSolver(LinearSolve.DefaultAlgorithmChoice.LUFactorization) else - error("CUDSS.jl is required for LU Factorizations on CuSparseMatrixCSR. Please load this library.") - end -end - -function LinearSolve.defaultalg(A::CUDA.CUSPARSE.CuSparseMatrixCSC{Tv, Ti}, b, - assump::OperatorAssumptions{Bool}) where {Tv, Ti} - if LinearSolve.cudss_loaded(A) - LinearSolve.DefaultLinearSolver(LinearSolve.DefaultAlgorithmChoice.LUFactorization) - else - error("CUDSS.jl is required for LU Factorizations on CuSparseMatrixCSC. Please load this library.") + if !LinearSolve.ALREADY_WARNED_CUDSS[] + @warn("CUDSS.jl is required for LU Factorizations on CuSparseMatrixCSR. Please load this library. Falling back to Krylov") + LinearSolve.ALREADY_WARNED_CUDSS[] = true + end + LinearSolve.DefaultLinearSolver(LinearSolve.DefaultAlgorithmChoice.KrylovJL_GMRES) end end @@ -45,13 +38,6 @@ function LinearSolve.error_no_cudss_lu(A::CUDA.CUSPARSE.CuSparseMatrixCSR) nothing end -function LinearSolve.error_no_cudss_lu(A::CUDA.CUSPARSE.CuSparseMatrixCSC) - if !LinearSolve.cudss_loaded(A) - error("CUDSS.jl is required for LU Factorizations on CuSparseMatrixCSC. Please load this library.") - end - nothing -end - function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadLUFactorization; kwargs...) if cache.isfresh @@ -66,15 +52,14 @@ function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadLUFact SciMLBase.build_linear_solution(alg, y, nothing, cache) end -function LinearSolve.init_cacheval( - alg::CudaOffloadLUFactorization, A::AbstractArray, b, u, Pl, Pr, +function LinearSolve.init_cacheval(alg::CudaOffloadLUFactorization, A::AbstractArray, b, u, Pl, Pr, maxiters::Int, abstol, reltol, verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions) # Check if CUDA is functional before creating CUDA arrays if !CUDA.functional() return nothing end - + T = eltype(A) noUnitT = typeof(zero(T)) luT = LinearAlgebra.lutype(noUnitT) @@ -102,7 +87,7 @@ function LinearSolve.init_cacheval(alg::CudaOffloadQRFactorization, A, b, u, Pl, if !CUDA.functional() return nothing end - + qr(CUDA.CuArray(A)) end @@ -119,8 +104,7 @@ function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadFactor SciMLBase.build_linear_solution(alg, y, nothing, cache) end -function LinearSolve.init_cacheval( - alg::CudaOffloadFactorization, A::AbstractArray, b, u, Pl, Pr, +function LinearSolve.init_cacheval(alg::CudaOffloadFactorization, A::AbstractArray, b, u, Pl, Pr, maxiters::Int, abstol, reltol, verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions) qr(CUDA.CuArray(A)) @@ -128,33 +112,27 @@ end function LinearSolve.init_cacheval( ::SparspakFactorization, A::CUDA.CUSPARSE.CuSparseMatrixCSR, b, u, - Pl, Pr, maxiters::Int, abstol, reltol, - verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions) + Pl, Pr, maxiters::Int, abstol, reltol, verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions) nothing end function LinearSolve.init_cacheval( ::KLUFactorization, A::CUDA.CUSPARSE.CuSparseMatrixCSR, b, u, - Pl, Pr, maxiters::Int, abstol, reltol, - verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions) + Pl, Pr, maxiters::Int, abstol, reltol, verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions) nothing end function LinearSolve.init_cacheval( ::UMFPACKFactorization, A::CUDA.CUSPARSE.CuSparseMatrixCSR, b, u, - Pl, Pr, maxiters::Int, abstol, reltol, - verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions) + Pl, Pr, maxiters::Int, abstol, reltol, verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions) nothing end # Mixed precision CUDA LU implementation -function SciMLBase.solve!( - cache::LinearSolve.LinearCache, alg::CUDAOffload32MixedLUFactorization; +function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CUDAOffload32MixedLUFactorization; kwargs...) if cache.isfresh - fact, A_gpu_f32, - b_gpu_f32, - u_gpu_f32 = LinearSolve.@get_cacheval(cache, :CUDAOffload32MixedLUFactorization) + fact, A_gpu_f32, b_gpu_f32, u_gpu_f32 = LinearSolve.@get_cacheval(cache, :CUDAOffload32MixedLUFactorization) # Compute 32-bit type on demand and convert T32 = eltype(cache.A) <: Complex ? ComplexF32 : Float32 A_f32 = T32.(cache.A) @@ -163,14 +141,12 @@ function SciMLBase.solve!( cache.cacheval = (fact, A_gpu_f32, b_gpu_f32, u_gpu_f32) cache.isfresh = false end - fact, A_gpu_f32, - b_gpu_f32, - u_gpu_f32 = LinearSolve.@get_cacheval(cache, :CUDAOffload32MixedLUFactorization) - + fact, A_gpu_f32, b_gpu_f32, u_gpu_f32 = LinearSolve.@get_cacheval(cache, :CUDAOffload32MixedLUFactorization) + # Compute types on demand for conversions T32 = eltype(cache.A) <: Complex ? ComplexF32 : Float32 Torig = eltype(cache.u) - + # Convert b to Float32, solve, then convert back to original precision b_f32 = T32.(cache.b) copyto!(b_gpu_f32, b_f32) diff --git a/ext/LinearSolveCUDSSExt.jl b/ext/LinearSolveCUDSSExt.jl index bbcf635f0..506ada99a 100644 --- a/ext/LinearSolveCUDSSExt.jl +++ b/ext/LinearSolveCUDSSExt.jl @@ -4,6 +4,5 @@ using LinearSolve: LinearSolve, cudss_loaded using CUDSS LinearSolve.cudss_loaded(A::CUDSS.CUDA.CUSPARSE.CuSparseMatrixCSR) = true -LinearSolve.cudss_loaded(A::CUDSS.CUDA.CUSPARSE.CuSparseMatrixCSC) = true end