From 7a4fcf62b6d26a319ab1e17440a4e25fb0c672c9 Mon Sep 17 00:00:00 2001
From: Xingjian Guo <xg703@nyu.edu>
Date: Tue, 3 Jul 2018 14:54:13 -0400
Subject: [PATCH 1/5] `EPIRK4s3A` (out-of-place)

---
 src/OrdinaryDiffEq.jl                         |  2 +-
 src/alg_utils.jl                              |  1 +
 src/algorithms.jl                             |  5 ++++
 src/caches/linear_nonlinear_caches.jl         |  4 +++
 .../exponential_rk_perform_step.jl            | 28 +++++++++++++++++++
 5 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/OrdinaryDiffEq.jl b/src/OrdinaryDiffEq.jl
index 6df9aa9e16..2416fe8d76 100644
--- a/src/OrdinaryDiffEq.jl
+++ b/src/OrdinaryDiffEq.jl
@@ -164,7 +164,7 @@ module OrdinaryDiffEq
 
   export GenericIIF1, GenericIIF2
 
-  export LawsonEuler, NorsettEuler, ETD1, ETDRK2, ETDRK3, ETDRK4, HochOst4, Exp4, ETD2
+  export LawsonEuler, NorsettEuler, ETD1, ETDRK2, ETDRK3, ETDRK4, HochOst4, Exp4, EPIRK4s3A, ETD2
 
   export SymplecticEuler, VelocityVerlet, VerletLeapfrog, PseudoVerletLeapfrog,
          McAte2, Ruth3, McAte3, CandyRoz4, McAte4, McAte42, McAte5,
diff --git a/src/alg_utils.jl b/src/alg_utils.jl
index 7e63a7a185..b8f353ddb7 100644
--- a/src/alg_utils.jl
+++ b/src/alg_utils.jl
@@ -110,6 +110,7 @@ alg_order(alg::ETDRK3) = 3
 alg_order(alg::ETDRK4) = 4
 alg_order(alg::HochOst4) = 4
 alg_order(alg::Exp4) = 4
+alg_order(alg::EPIRK4s3A) = 4
 alg_order(alg::SplitEuler) = 1
 alg_order(alg::ETD2) = 2
 
diff --git a/src/algorithms.jl b/src/algorithms.jl
index 37bd290a35..39f5477f43 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -737,6 +737,11 @@ struct Exp4 <: OrdinaryDiffEqExponentialAlgorithm
   iop::Int
 end
 Base.@pure Exp4(;m=30, iop=0)  = Exp4(m, iop)
+struct EPIRK4s3A <: OrdinaryDiffEqExponentialAlgorithm
+  m::Int
+  iop::Int
+end
+Base.@pure EPIRK4s3A(;m=30, iop=0) = EPIRK4s3A(m, iop)
 struct SplitEuler <: OrdinaryDiffEqExponentialAlgorithm end
 struct ETD2 <: OrdinaryDiffEqExponentialAlgorithm end
 
diff --git a/src/caches/linear_nonlinear_caches.jl b/src/caches/linear_nonlinear_caches.jl
index 09ec839f34..5fed2c0431 100644
--- a/src/caches/linear_nonlinear_caches.jl
+++ b/src/caches/linear_nonlinear_caches.jl
@@ -365,6 +365,10 @@ function alg_cache(alg::Exp4,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnit
   Exp4Cache(u,uprev,tmp,rtmp,rtmp2,k7,K,A,B,KsCache)
 end
 
+struct EPIRK4s3AConstantCache <: ExpRKConstantCache end
+alg_cache(alg::EPIRK4s3A,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnits,tTypeNoUnits,
+  uprev,uprev2,f,t,dt,reltol,p,calck,::Type{Val{false}}) = EPIRK4s3AConstantCache()
+
 ####################################
 # Multistep exponential method caches
 
diff --git a/src/perform_step/exponential_rk_perform_step.jl b/src/perform_step/exponential_rk_perform_step.jl
index bd257789da..e337cf80ad 100644
--- a/src/perform_step/exponential_rk_perform_step.jl
+++ b/src/perform_step/exponential_rk_perform_step.jl
@@ -665,6 +665,34 @@ function perform_step!(integrator, cache::Exp4Cache, repeat_step=false)
   # integrator.k is automatically set due to aliasing
 end
 
+function perform_step!(integrator, cache::EPIRK4s3AConstantCache, repeat_step=false)
+  @unpack t,dt,uprev,f,p = integrator
+  A = f.jac(uprev, p, t)
+  alg = typeof(integrator.alg) <: CompositeAlgorithm ? integrator.alg.algs[integrator.cache.current] : integrator.alg
+  f0 = integrator.fsalfirst # f(uprev) is fsaled
+  kwargs = [(:tol, integrator.opts.reltol), (:iop, alg.iop), (:norm, integrator.opts.internalnorm), (:adaptive, true)]
+
+  # Compute U2 and U3 vertically
+  K1 = phiv_timestep([dt/2, 2dt/3], A, [zeros(f0) f0]; kwargs...)
+  U2 = uprev + K1[:, 1]
+  U3 = uprev + K1[:, 2]
+  R2 = f(U2, p, t + dt/2)  - f0 - A*(U2 - uprev) # remainder of U2
+  R3 = f(U3, p, t + 2dt/3) - f0 - A*(U3 - uprev) # remainder of U3
+  
+  # Update u (horizontally)
+  B = zeros(eltype(f0), length(f0), 5)
+  B[:, 2] = f0
+  B[:, 4] = (32R2 - 13.5R3) / dt^2
+  B[:, 5] = (-144R2 + 81R3) / dt^3
+  u = uprev + phiv_timestep(dt, A, B; kwargs...)
+
+  # Update integrator state
+  integrator.fsallast = f(u, p, t + dt)
+  integrator.k[1] = integrator.fsalfirst
+  integrator.k[2] = integrator.fsallast
+  integrator.u = u
+end
+
 ######################################################
 # Multistep exponential integrators
 function initialize!(integrator,cache::ETD2ConstantCache)

From f0b06cfe473ee5567540f4e85043901ba0da8ad6 Mon Sep 17 00:00:00 2001
From: Xingjian Guo <xg703@nyu.edu>
Date: Tue, 3 Jul 2018 15:07:02 -0400
Subject: [PATCH 2/5] Allow sub arrays as input/output to `phiv_timestep`

---
 src/caches/linear_nonlinear_caches.jl           |  5 ++---
 src/exponential_utils.jl                        | 17 ++++++++++-------
 src/perform_step/exponential_rk_perform_step.jl |  3 ++-
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/caches/linear_nonlinear_caches.jl b/src/caches/linear_nonlinear_caches.jl
index 5fed2c0431..9f8e5ca71b 100644
--- a/src/caches/linear_nonlinear_caches.jl
+++ b/src/caches/linear_nonlinear_caches.jl
@@ -343,7 +343,6 @@ struct Exp4Cache{uType,rateType,matType,KsType} <: ExpRKCache
   tmp::uType
   rtmp::rateType
   rtmp2::rateType
-  k7::rateType
   K::matType
   A::matType
   B::matType
@@ -352,7 +351,7 @@ end
 function alg_cache(alg::Exp4,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnits,
   tTypeNoUnits,uprev,uprev2,f,t,dt,reltol,p,calck,::Type{Val{true}})
   tmp = similar(u)                                              # uType caches
-  rtmp, rtmp2, k7 = (zeros(rate_prototype) for i = 1:3)         # rateType caches
+  rtmp, rtmp2 = (zeros(rate_prototype) for i = 1:2)             # rateType caches
   # Allocate matrices
   # TODO: units
   n = length(u); T = eltype(u)
@@ -362,7 +361,7 @@ function alg_cache(alg::Exp4,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnit
   # Allocate caches for phiv_timestep
   maxiter = min(alg.m, n)
   KsCache = _phiv_timestep_caches(u, maxiter, 1)
-  Exp4Cache(u,uprev,tmp,rtmp,rtmp2,k7,K,A,B,KsCache)
+  Exp4Cache(u,uprev,tmp,rtmp,rtmp2,K,A,B,KsCache)
 end
 
 struct EPIRK4s3AConstantCache <: ExpRKConstantCache end
diff --git a/src/exponential_utils.jl b/src/exponential_utils.jl
index 1db98af3d5..16a4bbed08 100644
--- a/src/exponential_utils.jl
+++ b/src/exponential_utils.jl
@@ -364,7 +364,7 @@ end
 
 Non-allocating version of `expv` that uses precomputed Krylov subspace `Ks`.
 """
-function expv!(w::Vector{T}, t::Number, Ks::KrylovSubspace{B, T}; 
+function expv!(w::AbstractVector{T}, t::Number, Ks::KrylovSubspace{B, T}; 
   cache=nothing) where {B, T <: Number}
   m, beta, V, H = Ks.m, Ks.beta, getV(Ks), getH(Ks)
   @assert length(w) == size(V, 1) "Dimension mismatch"
@@ -451,7 +451,7 @@ end
 
 Non-allocating version of 'phiv' that uses precomputed Krylov subspace `Ks`.
 """
-function phiv!(w::Matrix{T}, t::Number, Ks::KrylovSubspace{B, T}, k::Integer; 
+function phiv!(w::AbstractMatrix{T}, t::Number, Ks::KrylovSubspace{B, T}, k::Integer; 
   cache=nothing, correct=false, errest=false) where {B, T <: Number}
   m, beta, V, H = Ks.m, Ks.beta, getV(Ks), getH(Ks)
   @assert size(w, 1) == size(V, 1) "Dimension mismatch"
@@ -529,12 +529,12 @@ end
 
 Non-allocating version of `expv_timestep`.
 """
-function expv_timestep!(u::Vector{T}, t::tType, A, b::Vector{T}; 
+function expv_timestep!(u::AbstractVector{T}, t::tType, A, b::AbstractVector{T}; 
   kwargs...) where {T <: Number, tType <: Real}
   expv_timestep!(reshape(u, length(u), 1), [t], A, b; kwargs...)
   return u
 end
-function expv_timestep!(U::Matrix{T}, ts::Vector{tType}, A, b::Vector{T}; 
+function expv_timestep!(U::AbstractMatrix{T}, ts::Vector{tType}, A, b::AbstractVector{T}; 
   kwargs...) where {T <: Number, tType <: Real}
   B = reshape(b, length(b), 1)
   phiv_timestep!(U, ts, A, B; kwargs...)
@@ -580,12 +580,12 @@ end
 
 Non-allocating version of `phiv_timestep`.
 """
-function phiv_timestep!(u::Vector{T}, t::tType, A, B::Matrix{T}; 
+function phiv_timestep!(u::AbstractVector{T}, t::tType, A, B::AbstractMatrix{T}; 
   kwargs...) where {T <: Number, tType <: Real}
   phiv_timestep!(reshape(u, length(u), 1), [t], A, B; kwargs...)
   return u
 end
-function phiv_timestep!(U::Matrix{T}, ts::Vector{tType}, A, B::Matrix{T}; tau::Real=0.0, 
+function phiv_timestep!(U::AbstractMatrix{T}, ts::Vector{tType}, A, B::AbstractMatrix{T}; tau::Real=0.0, 
   m::Int=min(10, size(A, 1)), tol::Real=1e-7, norm=Base.norm, iop::Int=0, 
   correct::Bool=false, caches=nothing, adaptive=false, delta::Real=1.2, 
   gamma::Real=0.8, NA::Int=0, verbose=false) where {T <: Number, tType <: Real}
@@ -613,7 +613,10 @@ function phiv_timestep!(U::Matrix{T}, ts::Vector{tType}, A, B::Matrix{T}; tau::R
     phiv_cache = nothing         # cache used by phiv!
   else
     u, W, P, Ks, phiv_cache = caches
-    @assert length(u) == n && size(W) == (n, p+1) && size(P) == (n, p+2) "Dimension mismatch"
+    @assert length(u) == n && size(W, 1) == n && size(P, 1) == n "Dimension mismatch"
+    # W and P may be bigger than actually needed
+    W = @view(W[:, 1:p+1])
+    P = @view(P[:, 1:p+2])
   end
   copy!(u, @view(B[:, 1])) # u(0) = b0
   coeffs = ones(tType, p);
diff --git a/src/perform_step/exponential_rk_perform_step.jl b/src/perform_step/exponential_rk_perform_step.jl
index e337cf80ad..6c2f0d5120 100644
--- a/src/perform_step/exponential_rk_perform_step.jl
+++ b/src/perform_step/exponential_rk_perform_step.jl
@@ -620,7 +620,7 @@ end
 
 function perform_step!(integrator, cache::Exp4Cache, repeat_step=false)
   @unpack t,dt,uprev,u,f,p = integrator
-  @unpack tmp,rtmp,rtmp2,k7,K,A,B,KsCache = cache
+  @unpack tmp,rtmp,rtmp2,K,A,B,KsCache = cache
   f.jac(A, uprev, p, t)
   alg = typeof(integrator.alg) <: CompositeAlgorithm ? integrator.alg.algs[integrator.cache.current] : integrator.alg
   f0 = integrator.fsalfirst # f(u0) is fsaled
@@ -656,6 +656,7 @@ function perform_step!(integrator, cache::Exp4Cache, repeat_step=false)
   A_mul_B!(rtmp, K, [1.0, -4/3, 1.0])
   Base.axpy!(dt, rtmp, u)
   # Krylov for the second remainder d7
+  k7 = @view(K[:, 1])
   phiv_timestep!(k7, ts[1], A, B; kwargs...)
   k7 ./= ts[1]
   Base.axpy!(dt/6, k7, u)

From 144e395f8ab43d79e13c89b43f3bb2717742a6aa Mon Sep 17 00:00:00 2001
From: Xingjian Guo <xg703@nyu.edu>
Date: Tue, 3 Jul 2018 15:50:30 -0400
Subject: [PATCH 3/5] `EPIRK4s3A` (in-place)

---
 src/caches/linear_nonlinear_caches.jl         | 26 +++++++++++
 .../exponential_rk_perform_step.jl            | 45 ++++++++++++++++---
 test/linear_nonlinear_krylov_tests.jl         |  4 +-
 3 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/src/caches/linear_nonlinear_caches.jl b/src/caches/linear_nonlinear_caches.jl
index 9f8e5ca71b..63028d6912 100644
--- a/src/caches/linear_nonlinear_caches.jl
+++ b/src/caches/linear_nonlinear_caches.jl
@@ -364,6 +364,32 @@ function alg_cache(alg::Exp4,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnit
   Exp4Cache(u,uprev,tmp,rtmp,rtmp2,K,A,B,KsCache)
 end
 
+struct EPIRK4s3ACache{uType,rateType,matType,KsType} <: ExpRKCache
+  u::uType
+  uprev::uType
+  tmp::uType
+  rtmp::rateType
+  rtmp2::rateType
+  K::matType
+  A::matType
+  B::matType
+  KsCache::KsType
+end
+function alg_cache(alg::EPIRK4s3A,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnits,
+  tTypeNoUnits,uprev,uprev2,f,t,dt,reltol,p,calck,::Type{Val{true}})
+  tmp = similar(u)                                    # uType caches
+  rtmp, rtmp2 = (zeros(rate_prototype) for i = 1:2)   # rateType caches
+  # Allocate matrices
+  n = length(u); T = eltype(u)
+  K = Matrix{T}(n, 2)
+  A = Matrix{T}(n, n) # TODO: sparse Jacobian support
+  B = zeros(T, n, 5)
+  # Allocate caches for phiv_timestep
+  maxiter = min(alg.m, n)
+  KsCache = _phiv_timestep_caches(u, maxiter, 4)
+  EPIRK4s3ACache(u,uprev,tmp,rtmp,rtmp2,K,A,B,KsCache)
+end
+
 struct EPIRK4s3AConstantCache <: ExpRKConstantCache end
 alg_cache(alg::EPIRK4s3A,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnits,tTypeNoUnits,
   uprev,uprev2,f,t,dt,reltol,p,calck,::Type{Val{false}}) = EPIRK4s3AConstantCache()
diff --git a/src/perform_step/exponential_rk_perform_step.jl b/src/perform_step/exponential_rk_perform_step.jl
index 6c2f0d5120..c71c35c601 100644
--- a/src/perform_step/exponential_rk_perform_step.jl
+++ b/src/perform_step/exponential_rk_perform_step.jl
@@ -674,11 +674,11 @@ function perform_step!(integrator, cache::EPIRK4s3AConstantCache, repeat_step=fa
   kwargs = [(:tol, integrator.opts.reltol), (:iop, alg.iop), (:norm, integrator.opts.internalnorm), (:adaptive, true)]
 
   # Compute U2 and U3 vertically
-  K1 = phiv_timestep([dt/2, 2dt/3], A, [zeros(f0) f0]; kwargs...)
-  U2 = uprev + K1[:, 1]
-  U3 = uprev + K1[:, 2]
-  R2 = f(U2, p, t + dt/2)  - f0 - A*(U2 - uprev) # remainder of U2
-  R3 = f(U3, p, t + 2dt/3) - f0 - A*(U3 - uprev) # remainder of U3
+  K = phiv_timestep([dt/2, 2dt/3], A, [zeros(f0) f0]; kwargs...)
+  U2 = uprev + K[:, 1] 
+  U3 = uprev + K[:, 2]
+  R2 = f(U2, p, t + dt/2)  - f0 - A*K[:, 1] # remainder of U2
+  R3 = f(U3, p, t + 2dt/3) - f0 - A*K[:, 2] # remainder of U3
   
   # Update u (horizontally)
   B = zeros(eltype(f0), length(f0), 5)
@@ -694,6 +694,41 @@ function perform_step!(integrator, cache::EPIRK4s3AConstantCache, repeat_step=fa
   integrator.u = u
 end
 
+function perform_step!(integrator, cache::EPIRK4s3ACache, repeat_step=false)
+  @unpack t,dt,uprev,u,f,p = integrator
+  @unpack tmp,rtmp,rtmp2,K,A,B,KsCache = cache
+  f.jac(A, uprev, p, t)
+  alg = typeof(integrator.alg) <: CompositeAlgorithm ? integrator.alg.algs[integrator.cache.current] : integrator.alg
+  f0 = integrator.fsalfirst # f(u0) is fsaled
+  kwargs = [(:tol, integrator.opts.reltol), (:iop, alg.iop), (:norm, integrator.opts.internalnorm), 
+    (:adaptive, true), (:caches, KsCache)]
+
+  # Compute U2 and U3 vertically
+  B[:, 2] .= f0
+  phiv_timestep!(K, [dt/2, 2dt/3], A, @view(B[:, 1:2]); kwargs...)
+  ## U2 and R2
+  @. tmp = uprev + @view(K[:, 1]) # tmp is now U2
+  f(rtmp, tmp, p, t + dt/2); A_mul_B!(rtmp2, A, @view(K[:, 1]))
+  @. rtmp = rtmp - f0 - rtmp2 # rtmp is now R2
+  B[:, 4] .= (32/dt^2) * rtmp
+  B[:, 5] .= (-144/dt^3) * rtmp
+  ## U3 and R3
+  @. tmp = uprev + @view(K[:, 2]) # tmp is now U3
+  f(rtmp, tmp, p, t + 2dt/3); A_mul_B!(rtmp2, A, @view(K[:, 2]))
+  @. rtmp = rtmp - f0 - rtmp2 # rtmp is now R3
+  B[:, 4] .-= (13.5/dt^2) * rtmp
+  B[:, 5] .+= (81/dt^3) * rtmp
+  
+  # Update u
+  du = @view(K[:, 1])
+  phiv_timestep!(du, dt, A, B; kwargs...)
+  @. u = uprev + du
+
+  # Update integrator state
+  f(integrator.fsallast, u, p, t + dt)
+  # integrator.k is automatically set due to aliasing
+end
+
 ######################################################
 # Multistep exponential integrators
 function initialize!(integrator,cache::ETD2ConstantCache)
diff --git a/test/linear_nonlinear_krylov_tests.jl b/test/linear_nonlinear_krylov_tests.jl
index 3f99e4512b..b463e3192f 100644
--- a/test/linear_nonlinear_krylov_tests.jl
+++ b/test/linear_nonlinear_krylov_tests.jl
@@ -49,8 +49,8 @@ end
     prob = ODEProblem(f, u0, (0.0, 1.0))
     prob_ip = ODEProblem{true}(f_ip, u0, (0.0, 1.0))
 
-    dt = 0.1; tol=1e-5
-    Algs = [Exp4]
+    dt = 0.05; tol=1e-5
+    Algs = [Exp4, EPIRK4s3A]
     for Alg in Algs
         gc()
         sol = solve(prob, Alg(); dt=dt, internalnorm=Base.norm, reltol=tol)

From 8486f912bf50473293f74fcd85e451aa1dbb6aa3 Mon Sep 17 00:00:00 2001
From: Xingjian Guo <xg703@nyu.edu>
Date: Wed, 4 Jul 2018 14:48:31 -0400
Subject: [PATCH 4/5] `EPIRK4s3B` (out-of-place)

---
 src/OrdinaryDiffEq.jl                         |  2 +-
 src/alg_utils.jl                              |  1 +
 src/algorithms.jl                             | 15 ++++------
 src/caches/linear_nonlinear_caches.jl         | 10 +++++--
 .../exponential_rk_perform_step.jl            | 30 +++++++++++++++++++
 5 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/src/OrdinaryDiffEq.jl b/src/OrdinaryDiffEq.jl
index 2416fe8d76..5189b5dfe1 100644
--- a/src/OrdinaryDiffEq.jl
+++ b/src/OrdinaryDiffEq.jl
@@ -164,7 +164,7 @@ module OrdinaryDiffEq
 
   export GenericIIF1, GenericIIF2
 
-  export LawsonEuler, NorsettEuler, ETD1, ETDRK2, ETDRK3, ETDRK4, HochOst4, Exp4, EPIRK4s3A, ETD2
+  export LawsonEuler, NorsettEuler, ETD1, ETDRK2, ETDRK3, ETDRK4, HochOst4, Exp4, EPIRK4s3A, EPIRK4s3B, ETD2
 
   export SymplecticEuler, VelocityVerlet, VerletLeapfrog, PseudoVerletLeapfrog,
          McAte2, Ruth3, McAte3, CandyRoz4, McAte4, McAte42, McAte5,
diff --git a/src/alg_utils.jl b/src/alg_utils.jl
index b8f353ddb7..1583548c8b 100644
--- a/src/alg_utils.jl
+++ b/src/alg_utils.jl
@@ -111,6 +111,7 @@ alg_order(alg::ETDRK4) = 4
 alg_order(alg::HochOst4) = 4
 alg_order(alg::Exp4) = 4
 alg_order(alg::EPIRK4s3A) = 4
+alg_order(alg::EPIRK4s3B) = 4
 alg_order(alg::SplitEuler) = 1
 alg_order(alg::ETD2) = 2
 
diff --git a/src/algorithms.jl b/src/algorithms.jl
index 39f5477f43..385d6448f7 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -732,16 +732,13 @@ for Alg in [:LawsonEuler, :NorsettEuler, :ETDRK2, :ETDRK3, :ETDRK4, :HochOst4]
   @eval Base.@pure $Alg(;krylov=false, m=30, iop=0) = $Alg(krylov, m, iop)
 end
 ETD1 = NorsettEuler # alias
-struct Exp4 <: OrdinaryDiffEqExponentialAlgorithm
-  m::Int
-  iop::Int
-end
-Base.@pure Exp4(;m=30, iop=0)  = Exp4(m, iop)
-struct EPIRK4s3A <: OrdinaryDiffEqExponentialAlgorithm
-  m::Int
-  iop::Int
+for Alg in [:Exp4, :EPIRK4s3A, :EPIRK4s3B]
+  @eval struct $Alg <: OrdinaryDiffEqExponentialAlgorithm
+    m::Int
+    iop::Int
+  end
+  @eval Base.@pure $Alg(;m=30, iop=0) = $Alg(m, iop)
 end
-Base.@pure EPIRK4s3A(;m=30, iop=0) = EPIRK4s3A(m, iop)
 struct SplitEuler <: OrdinaryDiffEqExponentialAlgorithm end
 struct ETD2 <: OrdinaryDiffEqExponentialAlgorithm end
 
diff --git a/src/caches/linear_nonlinear_caches.jl b/src/caches/linear_nonlinear_caches.jl
index 63028d6912..27fdfed011 100644
--- a/src/caches/linear_nonlinear_caches.jl
+++ b/src/caches/linear_nonlinear_caches.jl
@@ -364,6 +364,10 @@ function alg_cache(alg::Exp4,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnit
   Exp4Cache(u,uprev,tmp,rtmp,rtmp2,K,A,B,KsCache)
 end
 
+struct EPIRK4s3AConstantCache <: ExpRKConstantCache end
+alg_cache(alg::EPIRK4s3A,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnits,tTypeNoUnits,
+  uprev,uprev2,f,t,dt,reltol,p,calck,::Type{Val{false}}) = EPIRK4s3AConstantCache()
+
 struct EPIRK4s3ACache{uType,rateType,matType,KsType} <: ExpRKCache
   u::uType
   uprev::uType
@@ -390,9 +394,9 @@ function alg_cache(alg::EPIRK4s3A,u,rate_prototype,uEltypeNoUnits,uBottomEltypeN
   EPIRK4s3ACache(u,uprev,tmp,rtmp,rtmp2,K,A,B,KsCache)
 end
 
-struct EPIRK4s3AConstantCache <: ExpRKConstantCache end
-alg_cache(alg::EPIRK4s3A,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnits,tTypeNoUnits,
-  uprev,uprev2,f,t,dt,reltol,p,calck,::Type{Val{false}}) = EPIRK4s3AConstantCache()
+struct EPIRK4s3BConstantCache <: ExpRKConstantCache end
+alg_cache(alg::EPIRK4s3B,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnits,tTypeNoUnits,
+  uprev,uprev2,f,t,dt,reltol,p,calck,::Type{Val{false}}) = EPIRK4s3BConstantCache()
 
 ####################################
 # Multistep exponential method caches
diff --git a/src/perform_step/exponential_rk_perform_step.jl b/src/perform_step/exponential_rk_perform_step.jl
index c71c35c601..3c34b218f8 100644
--- a/src/perform_step/exponential_rk_perform_step.jl
+++ b/src/perform_step/exponential_rk_perform_step.jl
@@ -729,6 +729,36 @@ function perform_step!(integrator, cache::EPIRK4s3ACache, repeat_step=false)
   # integrator.k is automatically set due to aliasing
 end
 
+function perform_step!(integrator, cache::EPIRK4s3BConstantCache, repeat_step=false)
+  @unpack t,dt,uprev,f,p = integrator
+  A = f.jac(uprev, p, t)
+  alg = typeof(integrator.alg) <: CompositeAlgorithm ? integrator.alg.algs[integrator.cache.current] : integrator.alg
+  f0 = integrator.fsalfirst # f(uprev) is fsaled
+  kwargs = [(:tol, integrator.opts.reltol), (:iop, alg.iop), (:norm, integrator.opts.internalnorm), (:adaptive, true)]
+
+  # Compute U2 and U3 vertically
+  K = phiv_timestep([dt/2, 3dt/4], A, [zeros(f0) zeros(f0) f0]; kwargs...)
+  K[:, 1] .*= 8 / (3*dt)
+  K[:, 2] .*= 16 / (9*dt)
+  U2 = uprev + K[:, 1]
+  U3 = uprev + K[:, 2]
+  R2 = f(U2, p, t + dt/2)  - f0 - A*K[:, 1] # remainder of U2
+  R3 = f(U3, p, t + 3dt/4) - f0 - A*K[:, 2] # remainder of U3
+  
+  # Update u (horizontally)
+  B = zeros(eltype(f0), length(f0), 5)
+  B[:, 2] = f0
+  B[:, 4] = (54R2 - 16R3) / dt^2
+  B[:, 5] = (-324R2 + 144R3) / dt^3
+  u = uprev + phiv_timestep(dt, A, B; kwargs...)
+
+  # Update integrator state
+  integrator.fsallast = f(u, p, t + dt)
+  integrator.k[1] = integrator.fsalfirst
+  integrator.k[2] = integrator.fsallast
+  integrator.u = u
+end
+
 ######################################################
 # Multistep exponential integrators
 function initialize!(integrator,cache::ETD2ConstantCache)

From 461da9261a47f5e6d92bd4c5107057318f6a3585 Mon Sep 17 00:00:00 2001
From: Xingjian Guo <xg703@nyu.edu>
Date: Wed, 4 Jul 2018 15:00:56 -0400
Subject: [PATCH 5/5] `EPIRK4s3B` (in-place)

---
 src/caches/linear_nonlinear_caches.jl         | 26 ++++++++++++
 .../exponential_rk_perform_step.jl            | 40 +++++++++++++++++++
 test/linear_nonlinear_krylov_tests.jl         |  2 +-
 3 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/src/caches/linear_nonlinear_caches.jl b/src/caches/linear_nonlinear_caches.jl
index 27fdfed011..c22cec1bb3 100644
--- a/src/caches/linear_nonlinear_caches.jl
+++ b/src/caches/linear_nonlinear_caches.jl
@@ -398,6 +398,32 @@ struct EPIRK4s3BConstantCache <: ExpRKConstantCache end
 alg_cache(alg::EPIRK4s3B,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnits,tTypeNoUnits,
   uprev,uprev2,f,t,dt,reltol,p,calck,::Type{Val{false}}) = EPIRK4s3BConstantCache()
 
+struct EPIRK4s3BCache{uType,rateType,matType,KsType} <: ExpRKCache
+  u::uType
+  uprev::uType
+  tmp::uType
+  rtmp::rateType
+  rtmp2::rateType
+  K::matType
+  A::matType
+  B::matType
+  KsCache::KsType
+end
+function alg_cache(alg::EPIRK4s3B,u,rate_prototype,uEltypeNoUnits,uBottomEltypeNoUnits,
+  tTypeNoUnits,uprev,uprev2,f,t,dt,reltol,p,calck,::Type{Val{true}})
+  tmp = similar(u)                                    # uType caches
+  rtmp, rtmp2 = (zeros(rate_prototype) for i = 1:2)   # rateType caches
+  # Allocate matrices
+  n = length(u); T = eltype(u)
+  K = Matrix{T}(n, 2)
+  A = Matrix{T}(n, n) # TODO: sparse Jacobian support
+  B = zeros(T, n, 5)
+  # Allocate caches for phiv_timestep
+  maxiter = min(alg.m, n)
+  KsCache = _phiv_timestep_caches(u, maxiter, 4)
+  EPIRK4s3BCache(u,uprev,tmp,rtmp,rtmp2,K,A,B,KsCache)
+end
+
 ####################################
 # Multistep exponential method caches
 
diff --git a/src/perform_step/exponential_rk_perform_step.jl b/src/perform_step/exponential_rk_perform_step.jl
index 3c34b218f8..9f64694416 100644
--- a/src/perform_step/exponential_rk_perform_step.jl
+++ b/src/perform_step/exponential_rk_perform_step.jl
@@ -759,6 +759,46 @@ function perform_step!(integrator, cache::EPIRK4s3BConstantCache, repeat_step=fa
   integrator.u = u
 end
 
+function perform_step!(integrator, cache::EPIRK4s3BCache, repeat_step=false)
+  @unpack t,dt,uprev,u,f,p = integrator
+  @unpack tmp,rtmp,rtmp2,K,A,B,KsCache = cache
+  f.jac(A, uprev, p, t)
+  alg = typeof(integrator.alg) <: CompositeAlgorithm ? integrator.alg.algs[integrator.cache.current] : integrator.alg
+  f0 = integrator.fsalfirst # f(u0) is fsaled
+  kwargs = [(:tol, integrator.opts.reltol), (:iop, alg.iop), (:norm, integrator.opts.internalnorm), 
+    (:adaptive, true), (:caches, KsCache)]
+
+  # Compute U2 and U3 vertically
+  fill!(@view(B[:, 2]), zero(eltype(B)))
+  B[:, 3] .= f0
+  phiv_timestep!(K, [dt/2, 3dt/4], A, @view(B[:, 1:3]); kwargs...)
+  K[:, 1] .*= 8 / (3*dt)
+  K[:, 2] .*= 16 / (9*dt)
+  ## U2 and R2
+  @. tmp = uprev + @view(K[:, 1]) # tmp is now U2
+  f(rtmp, tmp, p, t + dt/2); A_mul_B!(rtmp2, A, @view(K[:, 1]))
+  @. rtmp = rtmp - f0 - rtmp2 # rtmp is now R2
+  B[:, 4] .= (54/dt^2) * rtmp
+  B[:, 5] .= (-324/dt^3) * rtmp
+  ## U3 and R3
+  @. tmp = uprev + @view(K[:, 2]) # tmp is now U3
+  f(rtmp, tmp, p, t + 3dt/4); A_mul_B!(rtmp2, A, @view(K[:, 2]))
+  @. rtmp = rtmp - f0 - rtmp2 # rtmp is now R3
+  B[:, 4] .-= (16/dt^2) * rtmp
+  B[:, 5] .+= (144/dt^3) * rtmp
+  
+  # Update u
+  fill!(@view(B[:, 3]), zero(eltype(B)))
+  B[:, 2] .= f0
+  du = @view(K[:, 1])
+  phiv_timestep!(du, dt, A, B; kwargs...)
+  @. u = uprev + du
+
+  # Update integrator state
+  f(integrator.fsallast, u, p, t + dt)
+  # integrator.k is automatically set due to aliasing
+end
+
 ######################################################
 # Multistep exponential integrators
 function initialize!(integrator,cache::ETD2ConstantCache)
diff --git a/test/linear_nonlinear_krylov_tests.jl b/test/linear_nonlinear_krylov_tests.jl
index b463e3192f..ef7c3e6bf0 100644
--- a/test/linear_nonlinear_krylov_tests.jl
+++ b/test/linear_nonlinear_krylov_tests.jl
@@ -50,7 +50,7 @@ end
     prob_ip = ODEProblem{true}(f_ip, u0, (0.0, 1.0))
 
     dt = 0.05; tol=1e-5
-    Algs = [Exp4, EPIRK4s3A]
+    Algs = [Exp4, EPIRK4s3A, EPIRK4s3B]
     for Alg in Algs
         gc()
         sol = solve(prob, Alg(); dt=dt, internalnorm=Base.norm, reltol=tol)