From 0295e3a065d132f20f3a313627cafe938df7f85e Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Tue, 17 May 2022 06:14:13 -0400
Subject: [PATCH 1/7] Remove Requires.jl and make a multi-package setup

---
 .github/workflows/CI.yml                      |  2 +
 Project.toml                                  |  7 +-
 docs/src/solvers/solvers.md                   | 19 ++++-
 lib/LinearSolveCUDA/Project.toml              | 22 +++++
 lib/LinearSolveCUDA/src/LinearSolveCUDA.jl    | 31 +++++++
 .../LinearSolveCUDA/test/runtests.jl          | 83 +++++++++----------
 lib/LinearSolvePardiso/Project.toml           | 22 +++++
 .../src/LinearSolvePardiso.jl                 | 58 +++++++------
 lib/LinearSolvePardiso/test/runtests.jl       | 30 +++++++
 src/LinearSolve.jl                            |  1 -
 src/cuda.jl                                   | 57 -------------
 src/init.jl                                   |  5 --
 test/downstream/Project.toml                  |  2 -
 test/runtests.jl                              | 24 ++++--
 14 files changed, 216 insertions(+), 147 deletions(-)
 create mode 100644 lib/LinearSolveCUDA/Project.toml
 create mode 100644 lib/LinearSolveCUDA/src/LinearSolveCUDA.jl
 rename test/cuda.jl => lib/LinearSolveCUDA/test/runtests.jl (90%)
 create mode 100644 lib/LinearSolvePardiso/Project.toml
 rename src/pardiso.jl => lib/LinearSolvePardiso/src/LinearSolvePardiso.jl (66%)
 create mode 100644 lib/LinearSolvePardiso/test/runtests.jl
 delete mode 100644 src/cuda.jl
 delete mode 100644 test/downstream/Project.toml

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 904e804fa..ffd4f0ab0 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -13,6 +13,8 @@ jobs:
       matrix:
         group:
           - All
+          - LinearSolveCUDA
+          - LinearSolvePardiso
         version:
           - '1'
           - '1.6'
diff --git a/Project.toml b/Project.toml
index 346963b00..0b729eaa5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,6 +6,7 @@ version = "1.15.0"
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 KLU = "ef3ab10e-7fda-4108-b977-705223b18434"
 Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
@@ -13,7 +14,6 @@ KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 RecursiveFactorization = "f2c3362d-daeb-58d1-803e-2bc74f2840b4"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
-Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
@@ -23,24 +23,23 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 [compat]
 ArrayInterface = "3, 4, 5"
 DocStringExtensions = "0.8"
+GPUArrays = "8"
 IterativeSolvers = "0.9.2"
 KLU = "0.3.0"
 Krylov = "0.7.11, 0.8"
 KrylovKit = "0.5"
 RecursiveFactorization = "0.2.8"
 Reexport = "1"
-Requires = "1"
 SciMLBase = "1.25"
 Setfield = "0.7, 0.8"
 UnPack = "1"
 julia = "1.6"
 
 [extras]
-Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Pardiso", "Pkg", "Random", "SafeTestsets"]
+test = ["Test", "Pkg", "Random", "SafeTestsets"]
diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md
index 94381d56d..431d85e75 100644
--- a/docs/src/solvers/solvers.md
+++ b/docs/src/solvers/solvers.md
@@ -77,6 +77,12 @@ customized per-package, details given below describe a subset of important array
       factorization does not use pivoting during the numerical factorization and therefore the
       procedure can fail even for invertible matrices.
 
+### LinearSolve.jl
+
+LinearSolve.jl contains some linear solvers built in.
+
+- `SimpleLUFactorization`: a simple LU-factorization implementation without BLAS. Fast for small matrices.
+
 ### SuiteSparse.jl
 
 By default, the SuiteSparse.jl are implemented for efficiency by caching the
@@ -92,8 +98,11 @@ be used in a context where that assumption does not hold, set `reuse_symbolic=fa
 
 ### Pardiso.jl
 
-This package is not loaded by default. Thus in order to use this package you
-must first `using Pardiso`. The following algorithms are pre-specified:
+!!! note
+
+    Using this solver requires adding the package LinearSolvePardiso.jl
+
+The following algorithms are pre-specified:
 
 - `MKLPardisoFactorize(;kwargs...)`: A sparse factorization method.
 - `MKLPardisoIterate(;kwargs...)`: A mixed factorization+iterative method.
@@ -128,7 +137,11 @@ end
 Note that `CuArrays` are supported by `GenericFactorization` in the "normal" way.
 The following are non-standard GPU factorization routines.
 
-- `GPUOffloadFactorization()`: An offloading technique used to GPU-accelerate CPU-based
+!!! note
+
+    Using this solver requires adding the package LinearSolveCUDA.jl
+    
+- `CudaOffloadFactorization()`: An offloading technique used to GPU-accelerate CPU-based
   computations. Requires a sufficiently large `A` to overcome the data transfer
   costs.
 
diff --git a/lib/LinearSolveCUDA/Project.toml b/lib/LinearSolveCUDA/Project.toml
new file mode 100644
index 000000000..d369c07cb
--- /dev/null
+++ b/lib/LinearSolveCUDA/Project.toml
@@ -0,0 +1,22 @@
+name = "LinearSolveCUDA"
+uuid = "0d26bed2-170e-468a-8415-1cfcbba6e180"
+authors = ["SciML"]
+version = "0.1"
+
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
+SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
+
+[compat]
+CUDA = "3"
+SciMLBase = "1.25"
+LinearSolve = "1"
+julia = "1.6"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test"]
\ No newline at end of file
diff --git a/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl b/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl
new file mode 100644
index 000000000..660932d12
--- /dev/null
+++ b/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl
@@ -0,0 +1,31 @@
+module LinearSolveCUDA
+
+using CUDA, LinearAlgebra, LinearSolve, SciMLBase
+
+struct CudaOffloadFactorization <: LinearSolve.AbstractFactorization end
+
+function SciMLBase.solve(cache::LinearCache, alg::CudaOffloadFactorization; kwargs...)
+    if cache.isfresh
+        fact = do_factorization(alg, CUDA.CuArray(cache.A), cache.b, cache.u)
+        cache = set_cacheval(cache, fact)
+    end
+
+    copyto!(cache.u, cache.b)
+    y = Array(ldiv!(cache.cacheval, CUDA.CuArray(cache.u)))
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+function do_factorization(alg::CudaOffloadFactorization, A, b, u)
+    A isa Union{AbstractMatrix,AbstractDiffEqOperator} ||
+        error("LU is not defined for $(typeof(A))")
+
+    if A isa AbstractDiffEqOperator
+        A = A.A
+    end
+    fact = qr(CUDA.CuArray(A))
+    return fact
+end
+
+export CudaOffloadFactorization
+
+end
\ No newline at end of file
diff --git a/test/cuda.jl b/lib/LinearSolveCUDA/test/runtests.jl
similarity index 90%
rename from test/cuda.jl
rename to lib/LinearSolveCUDA/test/runtests.jl
index 49421700c..1fc5e0cb5 100644
--- a/test/cuda.jl
+++ b/lib/LinearSolveCUDA/test/runtests.jl
@@ -1,42 +1,41 @@
-using LinearSolve, LinearAlgebra, SparseArrays
-using CUDA
-using Test
-
-n = 8
-A = Matrix(I,n,n)
-b = ones(n)
-A1 = A/1; b1 = rand(n); x1 = zero(b)
-A2 = A/2; b2 = rand(n); x2 = zero(b)
-
-prob1 = LinearProblem(A1, b1; u0=x1)
-prob2 = LinearProblem(A2, b2; u0=x2)
-
-cache_kwargs = (;verbose=true, abstol=1e-8, reltol=1e-8, maxiter=30,)
-
-function test_interface(alg, prob1, prob2)
-    A1 = prob1.A; b1 = prob1.b; x1 = prob1.u0
-    A2 = prob2.A; b2 = prob2.b; x2 = prob2.u0
-
-    y = solve(prob1, alg; cache_kwargs...)
-    @test A1 *  y  ≈ b1
-
-    cache = SciMLBase.init(prob1,alg; cache_kwargs...) # initialize cache
-    y = solve(cache)
-    @test A1 *  y  ≈ b1
-
-    cache = LinearSolve.set_A(cache,copy(A2))
-    y = solve(cache)
-    @test A2 *  y  ≈ b1
-
-    cache = LinearSolve.set_b(cache,b2)
-    y = solve(cache)
-    @test A2 *  y  ≈ b2
-
-    return
-end
-
-test_interface(GPUOffloadFactorization(), prob1, prob2)
-
-A1 = prob1.A; b1 = prob1.b; x1 = prob1.u0
-y = solve(prob1)
-@test A1 *  y  ≈ b1
+using LinearSolve, LinearSolveCUDA, LinearAlgebra, SparseArrays
+using Test
+
+n = 8
+A = Matrix(I,n,n)
+b = ones(n)
+A1 = A/1; b1 = rand(n); x1 = zero(b)
+A2 = A/2; b2 = rand(n); x2 = zero(b)
+
+prob1 = LinearProblem(A1, b1; u0=x1)
+prob2 = LinearProblem(A2, b2; u0=x2)
+
+cache_kwargs = (;verbose=true, abstol=1e-8, reltol=1e-8, maxiter=30,)
+
+function test_interface(alg, prob1, prob2)
+    A1 = prob1.A; b1 = prob1.b; x1 = prob1.u0
+    A2 = prob2.A; b2 = prob2.b; x2 = prob2.u0
+
+    y = solve(prob1, alg; cache_kwargs...)
+    @test A1 *  y  ≈ b1
+
+    cache = SciMLBase.init(prob1,alg; cache_kwargs...) # initialize cache
+    y = solve(cache)
+    @test A1 *  y  ≈ b1
+
+    cache = LinearSolve.set_A(cache,copy(A2))
+    y = solve(cache)
+    @test A2 *  y  ≈ b1
+
+    cache = LinearSolve.set_b(cache,b2)
+    y = solve(cache)
+    @test A2 *  y  ≈ b2
+
+    return
+end
+
+test_interface(GPUOffloadFactorization(), prob1, prob2)
+
+A1 = prob1.A; b1 = prob1.b; x1 = prob1.u0
+y = solve(prob1)
+@test A1 *  y  ≈ b1
diff --git a/lib/LinearSolvePardiso/Project.toml b/lib/LinearSolvePardiso/Project.toml
new file mode 100644
index 000000000..35588780e
--- /dev/null
+++ b/lib/LinearSolvePardiso/Project.toml
@@ -0,0 +1,22 @@
+name = "LinearSolvePardiso"
+uuid = "a6722589-28b8-4472-944e-bde9ee6da670"
+authors = ["SciML"]
+version = "0.1"
+
+[deps]
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
+Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"
+SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
+
+[compat]
+SciMLBase = "1.25"
+LinearSolve = "1"
+Pardiso =  "0.5"
+julia = "1.6"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test"]
\ No newline at end of file
diff --git a/src/pardiso.jl b/lib/LinearSolvePardiso/src/LinearSolvePardiso.jl
similarity index 66%
rename from src/pardiso.jl
rename to lib/LinearSolvePardiso/src/LinearSolvePardiso.jl
index 02d3f46ed..4bfa30d67 100644
--- a/src/pardiso.jl
+++ b/lib/LinearSolvePardiso/src/LinearSolvePardiso.jl
@@ -1,33 +1,37 @@
-Base.@kwdef struct PardisoJL <: SciMLLinearSolveAlgorithm
-    nprocs::Union{Int, Nothing} = nothing
-    solver_type::Union{Int, Pardiso.Solver, Nothing} = nothing
-    matrix_type::Union{Int, Pardiso.MatrixType, Nothing} = nothing
-    iparm::Union{Vector{Tuple{Int,Int}}, Nothing} = nothing
-    dparm::Union{Vector{Tuple{Int,Int}}, Nothing} = nothing
+module LinearSolvePardiso
+
+using LinearSolve, SciMLBase
+
+Base.@kwdef struct PardisoJL <: SciMLBase.SciMLLinearSolveAlgorithm
+    nprocs::Union{Int,Nothing} = nothing
+    solver_type::Union{Int,Pardiso.Solver,Nothing} = nothing
+    matrix_type::Union{Int,Pardiso.MatrixType,Nothing} = nothing
+    iparm::Union{Vector{Tuple{Int,Int}},Nothing} = nothing
+    dparm::Union{Vector{Tuple{Int,Int}},Nothing} = nothing
 end
 
-MKLPardisoFactorize(;kwargs...) = PardisoJL(;solver_type = 0,kwargs...)
-MKLPardisoIterate(;kwargs...) = PardisoJL(;solver_type = 1,kwargs...)
-needs_concrete_A(alg::PardisoJL) = true
+MKLPardisoFactorize(; kwargs...) = PardisoJL(; solver_type=0, kwargs...)
+MKLPardisoIterate(; kwargs...) = PardisoJL(; solver_type=1, kwargs...)
+LinearSolve.needs_concrete_A(alg::PardisoJL) = true
 
 # TODO schur complement functionality
 
-function init_cacheval(alg::PardisoJL, A, b, u, Pl, Pr, maxiters, abstol, reltol, verbose)
+function LinearSolve.init_cacheval(alg::PardisoJL, A, b, u, Pl, Pr, maxiters, abstol, reltol, verbose)
     @unpack nprocs, solver_type, matrix_type, iparm, dparm = alg
-    A = convert(AbstractMatrix,A)
+    A = convert(AbstractMatrix, A)
 
     solver =
-    if Pardiso.PARDISO_LOADED[]
-        solver = Pardiso.PardisoSolver()
-        solver_type !== nothing && Pardiso.set_solver!(solver, solver_type)
+        if Pardiso.PARDISO_LOADED[]
+            solver = Pardiso.PardisoSolver()
+            solver_type !== nothing && Pardiso.set_solver!(solver, solver_type)
 
-        solver
-    else
-        solver = Pardiso.MKLPardisoSolver()
-        nprocs !== nothing && Pardiso.set_nprocs!(solver, nprocs)
+            solver
+        else
+            solver = Pardiso.MKLPardisoSolver()
+            nprocs !== nothing && Pardiso.set_nprocs!(solver, nprocs)
 
-        solver
-    end
+            solver
+        end
 
     Pardiso.pardisoinit(solver) # default initialization
 
@@ -58,7 +62,7 @@ function init_cacheval(alg::PardisoJL, A, b, u, Pl, Pr, maxiters, abstol, reltol
     end
 
     # Make sure to say it's transposed because its CSC not CSR
-    Pardiso.set_iparm!(solver,12, 1)
+    Pardiso.set_iparm!(solver, 12, 1)
 
     #=
     Note: It is recommended to use IPARM(11)=1 (scaling) and IPARM(13)=1 (matchings) for
@@ -71,8 +75,8 @@ function init_cacheval(alg::PardisoJL, A, b, u, Pl, Pr, maxiters, abstol, reltol
     be changed to Pardiso.ANALYSIS_NUM_FACT in the solver loop otherwise instabilities
     occur in the example https://github.com/SciML/OrdinaryDiffEq.jl/issues/1569
     =#
-    Pardiso.set_iparm!(solver,11, 0)
-    Pardiso.set_iparm!(solver,13, 0)
+    Pardiso.set_iparm!(solver, 11, 0)
+    Pardiso.set_iparm!(solver, 13, 0)
 
     Pardiso.set_phase!(solver, Pardiso.ANALYSIS)
 
@@ -81,7 +85,7 @@ function init_cacheval(alg::PardisoJL, A, b, u, Pl, Pr, maxiters, abstol, reltol
         # applies these exact factors L and U for the next steps in a
         # preconditioned Krylov-Subspace iteration. If the iteration does not
         # converge, the solver will automatically switch back to the numerical factorization.
-        Pardiso.set_iparm!(solver,3,round(Int,abs(log10(reltol)),RoundDown) * 10 + 1)
+        Pardiso.set_iparm!(solver, 3, round(Int, abs(log10(reltol)), RoundDown) * 10 + 1)
     end
 
     Pardiso.pardiso(solver, u, A, b)
@@ -91,7 +95,7 @@ end
 
 function SciMLBase.solve(cache::LinearCache, alg::PardisoJL; kwargs...)
     @unpack A, b, u = cache
-    A = convert(AbstractMatrix,A)
+    A = convert(AbstractMatrix, A)
 
     if cache.isfresh
         Pardiso.set_phase!(cache.cacheval, Pardiso.NUM_FACT)
@@ -100,10 +104,12 @@ function SciMLBase.solve(cache::LinearCache, alg::PardisoJL; kwargs...)
 
     Pardiso.set_phase!(cache.cacheval, Pardiso.SOLVE_ITERATIVE_REFINE)
     Pardiso.pardiso(cache.cacheval, u, A, b)
-    return SciMLBase.build_linear_solution(alg,cache.u,nothing,cache)
+    return SciMLBase.build_linear_solution(alg, cache.u, nothing, cache)
 end
 
 # Add finalizer to release memory
 # Pardiso.set_phase!(cache.cacheval, Pardiso.RELEASE_ALL)
 
 export PardisoJL, MKLPardisoFactorize, MKLPardisoIterate
+
+end
\ No newline at end of file
diff --git a/lib/LinearSolvePardiso/test/runtests.jl b/lib/LinearSolvePardiso/test/runtests.jl
new file mode 100644
index 000000000..f101be89d
--- /dev/null
+++ b/lib/LinearSolvePardiso/test/runtests.jl
@@ -0,0 +1,30 @@
+using LinearSolve, LinearSolvePardiso, SparseArrays
+
+A1 = sparse([ 1. 0 -2  3
+                0  5  1  2
+            -2  1  4 -7
+                3  2 -7  5 ])
+b1 = rand(4)
+prob1 = LinearProblem(A1, b1)
+
+lambda = 3
+e = ones(n)
+e2 = ones(n-1)
+A2 = spdiagm(-1 => im*e2, 0 => lambda*e, 1 => -im*e2)
+b2 = rand(n) + im * zeros(n)
+
+prob2 = LinearProblem(A2, b2)
+
+for alg in (
+            PardisoJL(),
+            MKLPardisoFactorize(),
+            MKLPardisoIterate(),
+            )
+
+    u = solve(prob1, alg; cache_kwargs...).u
+    @test A1 * u ≈ b1
+
+    u = solve(prob2, alg; cache_kwargs...).u
+    @test eltype(u) <: Complex
+    @test_broken A2 * u ≈ b2
+end
\ No newline at end of file
diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl
index 50eaca349..138acf127 100644
--- a/src/LinearSolve.jl
+++ b/src/LinearSolve.jl
@@ -10,7 +10,6 @@ using SparseArrays
 using SciMLBase: AbstractDiffEqOperator, AbstractLinearAlgorithm
 using Setfield
 using UnPack
-using Requires
 using SuiteSparse
 using KLU
 using DocStringExtensions
diff --git a/src/cuda.jl b/src/cuda.jl
deleted file mode 100644
index 99eb5089b..000000000
--- a/src/cuda.jl
+++ /dev/null
@@ -1,57 +0,0 @@
-gpu_or_cpu(x::CUDA.CuArray) = CUDA.CuArray
-gpu_or_cpu(x::Transpose{<:Any,<:CUDA.CuArray}) = CUDA.CuArray
-gpu_or_cpu(x::Adjoint{<:Any,<:CUDA.CuArray}) = CUDA.CuArray
-isgpu(::CUDA.CuArray) = true
-isgpu(::Transpose{<:Any,<:CUDA.CuArray}) = true
-isgpu(::Adjoint{<:Any,<:CUDA.CuArray}) = true
-ifgpufree(x::CUDA.CuArray) = CUDA.unsafe_free!(x)
-ifgpufree(x::Transpose{<:Any,<:CUDA.CuArray}) = CUDA.unsafe_free!(x.parent)
-ifgpufree(x::Adjoint{<:Any,<:CUDA.CuArray}) = CUDA.unsafe_free!(x.parent)
-
-@require Tracker="9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" begin
-    TrackedArray = Tracker.TrackedArray
-    gpu_or_cpu(x::TrackedArray{<:Any,<:Any,<:CUDA.CuArray}) = CUDA.CuArray
-    gpu_or_cpu(x::Adjoint{<:Any,TrackedArray{<:Any,<:Any,<:CUDA.CuArray}}) = CUDA.CuArray
-    gpu_or_cpu(x::Transpose{<:Any,TrackedArray{<:Any,<:Any,<:CUDA.CuArray}}) = CUDA.CuArray
-    isgpu(::Adjoint{<:Any,TrackedArray{<:Any,<:Any,<:CUDA.CuArray}}) = true
-    isgpu(::TrackedArray{<:Any,<:Any,<:CUDA.CuArray}) = true
-    isgpu(::Transpose{<:Any,TrackedArray{<:Any,<:Any,<:CUDA.CuArray}}) = true
-    ifgpufree(x::TrackedArray{<:Any,<:Any,<:CUDA.CuArray}) = CUDA.unsafe_free!(x.data)
-    ifgpufree(x::Adjoint{<:Any,TrackedArray{<:Any,<:Any,<:CUDA.CuArray}}) = CUDA.unsafe_free!((x.data).parent)
-    ifgpufree(x::Transpose{<:Any,TrackedArray{<:Any,<:Any,<:CUDA.CuArray}}) = CUDA.unsafe_free!((x.data).parent)
-end
-
-struct GPUOffloadFactorization <: AbstractFactorization end
-
-function SciMLBase.solve(cache::LinearCache, alg::GPUOffloadFactorization; kwargs...)
-    if cache.isfresh
-        fact = do_factorization(alg, CUDA.CuArray(cache.A), cache.b, cache.u)
-        cache = set_cacheval(cache, fact)
-    end
-
-    copyto!(cache.u,cache.b)
-    y = Array(ldiv!(cache.cacheval, CUDA.CuArray(cache.u)))
-    SciMLBase.build_linear_solution(alg,y,nothing,cache)
-end
-
-function do_factorization(alg::GPUOffloadFactorization, A, b, u)
-    A isa Union{AbstractMatrix,AbstractDiffEqOperator} ||
-        error("LU is not defined for $(typeof(A))")
-
-    if A isa AbstractDiffEqOperator
-        A = A.A
-    end
-    fact = qr(CUDA.CuArray(A))
-    return fact
-end
-
-function LinearAlgebra.ldiv!(x::CUDA.CuArray,_qr::CUDA.CUSOLVER.CuQR,b::CUDA.CuArray)
-  _x = UpperTriangular(_qr.R) \ (_qr.Q' * reshape(b,length(b),1))
-  x .= vec(_x)
-  CUDA.unsafe_free!(_x)
-  return x
-end
-# make `\` work
-LinearAlgebra.ldiv!(F::CUDA.CUSOLVER.CuQR, b::CUDA.CuArray) = (x = similar(b); ldiv!(x, F, b); x)
-
-export GPUOffloadFactorization
diff --git a/src/init.jl b/src/init.jl
index bbdb9c28d..ca3b02680 100644
--- a/src/init.jl
+++ b/src/init.jl
@@ -1,5 +1,3 @@
-isgpu(x) = false
-ifgpufree(x) = nothing
 function __init__()
     @static if VERSION < v"1.7beta"
       blas = BLAS.vendor()
@@ -7,7 +5,4 @@ function __init__()
     else
       IS_OPENBLAS[] = occursin("openblas", BLAS.get_config().loaded_libs[1].libname)
     end
-
-    @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" include("cuda.jl")
-    @require Pardiso="46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2" include("pardiso.jl")
 end
diff --git a/test/downstream/Project.toml b/test/downstream/Project.toml
deleted file mode 100644
index c96c01b2f..000000000
--- a/test/downstream/Project.toml
+++ /dev/null
@@ -1,2 +0,0 @@
-[deps]
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
diff --git a/test/runtests.jl b/test/runtests.jl
index b97bbb403..9e725b080 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,11 +3,16 @@ using SafeTestsets
 const LONGER_TESTS = false
 
 const GROUP = get(ENV, "GROUP", "All")
-const is_APPVEYOR = Sys.iswindows() && haskey(ENV,"APPVEYOR")
 
-function activate_downstream_env()
-    Pkg.activate("downstream")
-    Pkg.develop(PackageSpec(path=dirname(@__DIR__)))
+function dev_subpkg(subpkg)
+    subpkg_path = joinpath(dirname(@__DIR__), "lib", subpkg)
+    Pkg.develop(PackageSpec(path=subpkg_path))
+end
+
+function activate_subpkg_env(subpkg)
+    subpkg_path = joinpath(dirname(@__DIR__), "lib", subpkg)
+    Pkg.activate(subpkg_path)
+    Pkg.develop(PackageSpec(path=subpkg_path))
     Pkg.instantiate()
 end
 
@@ -15,7 +20,12 @@ if GROUP == "All" || GROUP == "Core"
   @time @safetestset "Basic Tests" begin include("basictests.jl") end
 end
 
-if !is_APPVEYOR && GROUP == "GPU"
-  activate_downstream_env()
-  @time @safetestset "CUDA" begin include("cuda.jl") end
+if GROUP == "LinearSolveCUDA"
+  dev_subpkg("LinearSolveCUDA")
+  @time @safetestset "CUDA" begin include("../lib/LinearSolveCUDA/test/runtests.jl") end
+end
+
+if GROUP == "LinearSolvePardiso"
+  dev_subpkg("LinearSolvePardiso")
+  @time @safetestset "Pardiso" begin include("../lib/LinearSolvePardiso/test/runtests.jl") end
 end

From b3c38a3a091b28ee4a6447e9868f8aff3a1f5c17 Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Tue, 17 May 2022 06:23:23 -0400
Subject: [PATCH 2/7] Fix tests

---
 .buildkite/pipeline.yml                          | 4 ++--
 lib/LinearSolvePardiso/src/LinearSolvePardiso.jl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 5fe3bef07..fa895d8e0 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -1,5 +1,5 @@
 steps:
-  - label: "GPU"
+  - label: "LinearSolveCUDA"
     plugins:
       - JuliaCI/julia#v1:
           version: "1"
@@ -9,7 +9,7 @@ steps:
       queue: "juliagpu"
       cuda: "*"
     env:
-      GROUP: 'GPU'
+      GROUP: 'LinearSolveCUDA'
       JULIA_PKG_SERVER: "" # it often struggles with our large artifacts
       # SECRET_CODECOV_TOKEN: "..."
     timeout_in_minutes: 30
diff --git a/lib/LinearSolvePardiso/src/LinearSolvePardiso.jl b/lib/LinearSolvePardiso/src/LinearSolvePardiso.jl
index 4bfa30d67..ef75a452b 100644
--- a/lib/LinearSolvePardiso/src/LinearSolvePardiso.jl
+++ b/lib/LinearSolvePardiso/src/LinearSolvePardiso.jl
@@ -1,6 +1,6 @@
 module LinearSolvePardiso
 
-using LinearSolve, SciMLBase
+using Pardiso, LinearSolve, SciMLBase
 
 Base.@kwdef struct PardisoJL <: SciMLBase.SciMLLinearSolveAlgorithm
     nprocs::Union{Int,Nothing} = nothing

From 647161923b7e7ca1ff4fd252df75c4097af47dba Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Tue, 17 May 2022 06:27:57 -0400
Subject: [PATCH 3/7] Namespace

---
 lib/LinearSolveCUDA/src/LinearSolveCUDA.jl       | 6 +++---
 lib/LinearSolvePardiso/src/LinearSolvePardiso.jl | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl b/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl
index 660932d12..dab132caf 100644
--- a/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl
+++ b/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl
@@ -4,10 +4,10 @@ using CUDA, LinearAlgebra, LinearSolve, SciMLBase
 
 struct CudaOffloadFactorization <: LinearSolve.AbstractFactorization end
 
-function SciMLBase.solve(cache::LinearCache, alg::CudaOffloadFactorization; kwargs...)
+function SciMLBase.solve(cache::LinearSolve.LinearCache, alg::CudaOffloadFactorization; kwargs...)
     if cache.isfresh
-        fact = do_factorization(alg, CUDA.CuArray(cache.A), cache.b, cache.u)
-        cache = set_cacheval(cache, fact)
+        fact = LinearSolve.do_factorization(alg, CUDA.CuArray(cache.A), cache.b, cache.u)
+        cache = LinearSolve.set_cacheval(cache, fact)
     end
 
     copyto!(cache.u, cache.b)
diff --git a/lib/LinearSolvePardiso/src/LinearSolvePardiso.jl b/lib/LinearSolvePardiso/src/LinearSolvePardiso.jl
index ef75a452b..2942fde41 100644
--- a/lib/LinearSolvePardiso/src/LinearSolvePardiso.jl
+++ b/lib/LinearSolvePardiso/src/LinearSolvePardiso.jl
@@ -93,7 +93,7 @@ function LinearSolve.init_cacheval(alg::PardisoJL, A, b, u, Pl, Pr, maxiters, ab
     return solver
 end
 
-function SciMLBase.solve(cache::LinearCache, alg::PardisoJL; kwargs...)
+function SciMLBase.solve(cache::LinearSolve.LinearCache, alg::PardisoJL; kwargs...)
     @unpack A, b, u = cache
     A = convert(AbstractMatrix, A)
 

From 29aecf0ac6db98eda6ed5589251cf1fba83d8cc2 Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Tue, 17 May 2022 08:14:48 -0400
Subject: [PATCH 4/7] Remove Pardiso test from basics

---
 .github/workflows/CI.yml |  1 -
 test/basictests.jl       | 36 ------------------------------------
 2 files changed, 37 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index ffd4f0ab0..6238e812c 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -13,7 +13,6 @@ jobs:
       matrix:
         group:
           - All
-          - LinearSolveCUDA
           - LinearSolvePardiso
         version:
           - '1'
diff --git a/test/basictests.jl b/test/basictests.jl
index faa63c133..b5d3696af 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -178,42 +178,6 @@ end
     end
 end
 
-@testset "PardisoJL" begin
-    @test_throws UndefVarError alg = PardisoJL()
-
-    using Pardiso, SparseArrays
-
-    A1 = sparse([ 1. 0 -2  3
-                 0  5  1  2
-                -2  1  4 -7
-                 3  2 -7  5 ])
-    b1 = rand(4)
-    prob1 = LinearProblem(A1, b1)
-
-    lambda = 3
-    e = ones(n)
-    e2 = ones(n-1)
-    A2 = spdiagm(-1 => im*e2, 0 => lambda*e, 1 => -im*e2)
-    b2 = rand(n) + im * zeros(n)
-
-    prob2 = LinearProblem(A2, b2)
-
-    for alg in (
-                PardisoJL(),
-                MKLPardisoFactorize(),
-                MKLPardisoIterate(),
-               )
-
-        u = solve(prob1, alg; cache_kwargs...).u
-        @test A1 * u ≈ b1
-
-        u = solve(prob2, alg; cache_kwargs...).u
-        @test eltype(u) <: Complex
-        @test_broken A2 * u ≈ b2
-    end
-
-end
-
 @testset "Preconditioners" begin
     @testset "Vector Diagonal Preconditioner" begin
         s = rand(n)

From 28188d9fd6d0ab8ae6e5749de1527b54dd7737fa Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Tue, 17 May 2022 08:26:31 -0400
Subject: [PATCH 5/7] Move last of pardiso to the subpackage

---
 lib/LinearSolvePardiso/test/runtests.jl | 39 ++++++++++++++++++-------
 test/basictests.jl                      |  9 ------
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/lib/LinearSolvePardiso/test/runtests.jl b/lib/LinearSolvePardiso/test/runtests.jl
index f101be89d..e78f8a7ec 100644
--- a/lib/LinearSolvePardiso/test/runtests.jl
+++ b/lib/LinearSolvePardiso/test/runtests.jl
@@ -1,25 +1,25 @@
 using LinearSolve, LinearSolvePardiso, SparseArrays
 
-A1 = sparse([ 1. 0 -2  3
-                0  5  1  2
-            -2  1  4 -7
-                3  2 -7  5 ])
+A1 = sparse([1.0 0 -2 3
+    0 5 1 2
+    -2 1 4 -7
+    3 2 -7 5])
 b1 = rand(4)
 prob1 = LinearProblem(A1, b1)
 
 lambda = 3
 e = ones(n)
-e2 = ones(n-1)
-A2 = spdiagm(-1 => im*e2, 0 => lambda*e, 1 => -im*e2)
+e2 = ones(n - 1)
+A2 = spdiagm(-1 => im * e2, 0 => lambda * e, 1 => -im * e2)
 b2 = rand(n) + im * zeros(n)
 
 prob2 = LinearProblem(A2, b2)
 
 for alg in (
-            PardisoJL(),
-            MKLPardisoFactorize(),
-            MKLPardisoIterate(),
-            )
+    PardisoJL(),
+    MKLPardisoFactorize(),
+    MKLPardisoIterate(),
+)
 
     u = solve(prob1, alg; cache_kwargs...).u
     @test A1 * u ≈ b1
@@ -27,4 +27,21 @@ for alg in (
     u = solve(prob2, alg; cache_kwargs...).u
     @test eltype(u) <: Complex
     @test_broken A2 * u ≈ b2
-end
\ No newline at end of file
+end
+
+n = 4
+Random.seed!(10)
+A = sprand(n, n, 0.8);
+A2 = 2.0 .* A;
+b1 = rand(n);
+b2 = rand(n);
+prob = LinearProblem(copy(A), copy(b1))
+
+linsolve = init(prob, MKLPardisoFactorize())
+sol31 = solve(linsolve)
+linsolve = LinearSolve.set_b(sol31.cache, copy(b2))
+sol32 = solve(linsolve)
+linsolve = LinearSolve.set_A(sol32.cache, copy(A2))
+sol33 = solve(linsolve)
+@test sol13.u ≈ sol23.u
+@test sol13.u ≈ sol33.u
\ No newline at end of file
diff --git a/test/basictests.jl b/test/basictests.jl
index b5d3696af..a4374aa9d 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -236,19 +236,10 @@ end
     linsolve = LinearSolve.set_A(sol22.cache,copy(A2))
     sol23 = solve(linsolve)
 
-    linsolve = init(prob,MKLPardisoFactorize())
-    sol31 = solve(linsolve)
-    linsolve = LinearSolve.set_b(sol31.cache,copy(b2))
-    sol32 = solve(linsolve)
-    linsolve = LinearSolve.set_A(sol32.cache,copy(A2))
-    sol33 = solve(linsolve)
-
     @test sol11.u ≈ sol21.u
     @test sol11.u ≈ sol31.u
     @test sol12.u ≈ sol22.u
     @test sol12.u ≈ sol32.u
-    @test sol13.u ≈ sol23.u
-    @test sol13.u ≈ sol33.u
 end
 
 @testset "Solve Function" begin

From 40f908f6955acba09b2e0d5ecfdccc18967ac370 Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Tue, 17 May 2022 19:55:42 -0400
Subject: [PATCH 6/7] fix tests

---
 lib/LinearSolveCUDA/test/runtests.jl    |  2 +-
 lib/LinearSolvePardiso/test/runtests.jl | 12 +++++++++++-
 test/basictests.jl                      |  3 +--
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/lib/LinearSolveCUDA/test/runtests.jl b/lib/LinearSolveCUDA/test/runtests.jl
index 1fc5e0cb5..3fd98fd89 100644
--- a/lib/LinearSolveCUDA/test/runtests.jl
+++ b/lib/LinearSolveCUDA/test/runtests.jl
@@ -34,7 +34,7 @@ function test_interface(alg, prob1, prob2)
     return
 end
 
-test_interface(GPUOffloadFactorization(), prob1, prob2)
+test_interface(CudaOffloadFactorization(), prob1, prob2)
 
 A1 = prob1.A; b1 = prob1.b; x1 = prob1.u0
 y = solve(prob1)
diff --git a/lib/LinearSolvePardiso/test/runtests.jl b/lib/LinearSolvePardiso/test/runtests.jl
index e78f8a7ec..811eeb28a 100644
--- a/lib/LinearSolvePardiso/test/runtests.jl
+++ b/lib/LinearSolvePardiso/test/runtests.jl
@@ -37,11 +37,21 @@ b1 = rand(n);
 b2 = rand(n);
 prob = LinearProblem(copy(A), copy(b1))
 
+prob = LinearProblem(copy(A), copy(b1))
+linsolve = init(prob, UMFPACKFactorization())
+sol11 = solve(linsolve)
+linsolve = LinearSolve.set_b(sol11.cache, copy(b2))
+sol12 = solve(linsolve)
+linsolve = LinearSolve.set_A(sol12.cache, copy(A2))
+sol13 = solve(linsolve)
+
 linsolve = init(prob, MKLPardisoFactorize())
 sol31 = solve(linsolve)
 linsolve = LinearSolve.set_b(sol31.cache, copy(b2))
 sol32 = solve(linsolve)
 linsolve = LinearSolve.set_A(sol32.cache, copy(A2))
 sol33 = solve(linsolve)
-@test sol13.u ≈ sol23.u
+
+@test sol11.u ≈ sol31.u
+@test sol12.u ≈ sol32.u
 @test sol13.u ≈ sol33.u
\ No newline at end of file
diff --git a/test/basictests.jl b/test/basictests.jl
index a4374aa9d..f0ea53d05 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -237,9 +237,8 @@ end
     sol23 = solve(linsolve)
 
     @test sol11.u ≈ sol21.u
-    @test sol11.u ≈ sol31.u
     @test sol12.u ≈ sol22.u
-    @test sol12.u ≈ sol32.u
+    @test sol13.u ≈ sol23.u
 end
 
 @testset "Solve Function" begin

From 4fac1680c97e4140686801c753d2721f7f1a501c Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Tue, 17 May 2022 20:03:36 -0400
Subject: [PATCH 7/7] Fix extension

---
 lib/LinearSolveCUDA/src/LinearSolveCUDA.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl b/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl
index dab132caf..82acf9691 100644
--- a/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl
+++ b/lib/LinearSolveCUDA/src/LinearSolveCUDA.jl
@@ -15,7 +15,7 @@ function SciMLBase.solve(cache::LinearSolve.LinearCache, alg::CudaOffloadFactori
     SciMLBase.build_linear_solution(alg, y, nothing, cache)
 end
 
-function do_factorization(alg::CudaOffloadFactorization, A, b, u)
+function LinearSolve.do_factorization(alg::CudaOffloadFactorization, A, b, u)
     A isa Union{AbstractMatrix,AbstractDiffEqOperator} ||
         error("LU is not defined for $(typeof(A))")