diff --git a/Project.toml b/Project.toml
index ce6dc1f8..b71f2ba2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -20,6 +20,7 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
@@ -43,6 +44,7 @@ Pkg = "1.11.0"
 Plots = "1.40.15"
 Printf = "1.11.0"
 SparseArrays = "1.11.0"
+Random = "1.11.0"
 Roots = "2.2.13"
 SpecialFunctions = "2.5.1"
 StaticArrays = "1.9.15"
diff --git a/benchmarks/benchmark_delta_prime_methods.jl b/benchmarks/benchmark_delta_prime_methods.jl
new file mode 100644
index 00000000..704763f4
--- /dev/null
+++ b/benchmarks/benchmark_delta_prime_methods.jl
@@ -0,0 +1,95 @@
+# Sanity check: compute_delta_prime_from_ca! vs inline Δ' from riccati_cross_ideal_singular_surf!
+#
+# riccati_cross_ideal_singular_surf! computes Δ' inline at each singular surface crossing
+# using the diagonal formula (no Gaussian reduction permutation):
+#   Δ'[s] = (ca_r[ipert_res, ipert_res, 2, s] - ca_l[ipert_res, ipert_res, 2, s]) / (4π²·ψ₀)
+#
+# compute_delta_prime_from_ca! applies the identical formula post-hoc from the stored
+# ca_l/ca_r arrays. Since both operate on the same data with the same formula, results
+# should match to floating-point precision (not just approximately — exactly).
+#
+# This verifies that compute_delta_prime_from_ca! is a correct standalone implementation
+# of the Δ' formula that can be used for testing or alternative integration drivers.
+#
+# Usage (from JPEC_main root):
+#   julia --project=. benchmarks/benchmark_delta_prime_methods.jl
+
+using LinearAlgebra, Printf, TOML
+using GeneralizedPerturbedEquilibrium
+
+const FFS = GeneralizedPerturbedEquilibrium.ForceFreeStates
+
+function setup_and_run_solovev()
+    ex = joinpath(@__DIR__, "..", "test", "test_data", "regression_solovev_ideal_example")
+    inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+    inputs["ForceFreeStates"]["verbose"] = false
+    inputs["ForceFreeStates"]["use_riccati"] = true
+    intr = FFS.ForceFreeStatesInternal(; dir_path=ex)
+    ctrl = FFS.ForceFreeStatesControl(;
+        (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+    eq_config = GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex)
+    equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(eq_config)
+    intr.wall_settings = GeneralizedPerturbedEquilibrium.Vacuum.WallShapeSettings(;
+        (Symbol(k) => v for (k, v) in inputs["Wall"])...)
+    FFS.sing_lim!(intr, ctrl, equil)
+    intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+    FFS.sing_find!(intr, equil)
+    intr.mlow  = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+    intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+    intr.mpert = intr.mhigh - intr.mlow + 1
+    intr.mband = intr.mpert - 1
+    intr.numpert_total = intr.mpert * intr.npert
+    metric = FFS.make_metric(equil; mband=intr.mband, fft_flag=ctrl.fft_flag)
+    ffit = FFS.make_matrix(equil, intr, metric)
+    odet = FFS.riccati_eulerlagrange_integration(ctrl, equil, ffit, intr)
+    return ctrl, equil, ffit, intr, odet
+end
+
+println("\n=== compute_delta_prime_from_ca! consistency check ===")
+println("Verifies the standalone Δ' formula matches the inline Riccati crossing computation.")
+println("Expected error: exactly zero (same formula, same data).\n")
+
+ctrl, equil, ffit, intr, odet = setup_and_run_solovev()
+msing = intr.msing
+
+# Capture Δ' values set inline by riccati_cross_ideal_singular_surf! during integration
+delta_prime_inline = [copy(intr.sing[s].delta_prime) for s in 1:msing]
+
+# Now call compute_delta_prime_from_ca! — it reads the same ca_l/ca_r arrays and
+# overwrites intr.sing[s].delta_prime using the identical diagonal formula
+FFS.compute_delta_prime_from_ca!(odet, intr, equil)
+
+println("  N=$(intr.numpert_total) modes, $msing singular surfaces\n")
+@printf("  %6s  %4s  %4s  %22s  %22s  %12s\n",
+        "Surf", "m", "n", "Δ' (inline)", "Δ' (from_ca)", "abs diff")
+println("  " * "-"^76)
+
+max_absdiff = let max_absdiff = 0.0
+    for s in 1:msing
+        sing = intr.sing[s]
+        dp_from_ca = intr.sing[s].delta_prime
+        for i in eachindex(delta_prime_inline[s])
+            dp_il  = delta_prime_inline[s][i]
+            dp_fc  = dp_from_ca[i]
+            absdiff = abs(dp_fc - dp_il)
+            max_absdiff = max(max_absdiff, absdiff)
+            @printf("  %6d  %4d  %4d  %22.6f%+.6fi  %22.6f%+.6fi  %12.4e\n",
+                    s, sing.m[i], sing.n[i],
+                    real(dp_il), imag(dp_il),
+                    real(dp_fc), imag(dp_fc),
+                    absdiff)
+        end
+    end
+    max_absdiff
+end
+
+println()
+if max_absdiff == 0.0
+    println("PASSED — Δ' values are bit-for-bit identical (max abs diff = 0.0)")
+elseif max_absdiff < 1e-14
+    @printf("PASSED — max abs diff = %.2e (floating-point rounding only)\n", max_absdiff)
+else
+    @printf("FAILED — max abs diff = %.2e (expected exact agreement)\n", max_absdiff)
+    exit(1)
+end
+println()
diff --git a/benchmarks/benchmark_riccati_der.jl b/benchmarks/benchmark_riccati_der.jl
new file mode 100644
index 00000000..f751588f
--- /dev/null
+++ b/benchmarks/benchmark_riccati_der.jl
@@ -0,0 +1,131 @@
+# Sanity check: riccati_der! correctly evaluates the explicit Riccati ODE.
+#
+# riccati_der! implements [Glasser 2018 Phys. Plasmas 25, 032507, Eq. 19]:
+#   dS/dψ = w†·F̄⁻¹·w - S·Ḡ·S,   w = Q - K̄·S
+#
+# where Q = diag(1/(m - n·q)), F̄ = L·L† (Cholesky), K̄ and Ḡ are the MHD
+# metric matrices evaluated at ψ.
+#
+# NOTE: The identity between this Riccati ODE and the EL chain rule
+#   dS/dψ = dU₁·U₂⁻¹ - S·dU₂·U₂⁻¹
+# holds ONLY for Hermitian S (physical states evolved from the axis, where
+# S†=S is preserved by the EL symmetry). For arbitrary non-Hermitian (U₁, U₂),
+# the two expressions differ — so this script compares riccati_der! against the
+# explicit formula rather than against sing_der!.
+#
+# Usage (from JPEC_main root):
+#   julia --project=. benchmarks/benchmark_riccati_der.jl
+
+using LinearAlgebra, Random, Printf, TOML
+using GeneralizedPerturbedEquilibrium
+
+const FFS = GeneralizedPerturbedEquilibrium.ForceFreeStates
+
+function setup_solovev()
+    ex = joinpath(@__DIR__, "..", "test", "test_data", "regression_solovev_ideal_example")
+    inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+    inputs["ForceFreeStates"]["verbose"] = false
+    intr = FFS.ForceFreeStatesInternal(; dir_path=ex)
+    ctrl = FFS.ForceFreeStatesControl(;
+        (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+    eq_config = GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex)
+    equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(eq_config)
+    intr.wall_settings = GeneralizedPerturbedEquilibrium.Vacuum.WallShapeSettings(;
+        (Symbol(k) => v for (k, v) in inputs["Wall"])...)
+    FFS.sing_lim!(intr, ctrl, equil)
+    intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+    FFS.sing_find!(intr, equil)
+    intr.mlow  = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+    intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+    intr.mpert = intr.mhigh - intr.mlow + 1
+    intr.mband = intr.mpert - 1
+    intr.numpert_total = intr.mpert * intr.npert
+    metric = FFS.make_metric(equil; mband=intr.mband, fft_flag=ctrl.fft_flag)
+    ffit = FFS.make_matrix(equil, intr, metric)
+    return ctrl, equil, ffit, intr
+end
+
+# Evaluate the Riccati RHS explicitly from splines: dS = w†·F̄⁻¹·w - S·Ḡ·S
+function riccati_rhs_manual(S, psi, equil, ffit, intr)
+    N = intr.numpert_total
+    L    = zeros(ComplexF64, N, N)
+    Kmat = zeros(ComplexF64, N, N)
+    Gmat = zeros(ComplexF64, N, N)
+    ffit.fmats_lower(vec(L),    psi; hint=ffit._hint)
+    ffit.kmats(vec(Kmat), psi; hint=ffit._hint)
+    ffit.gmats(vec(Gmat), psi; hint=ffit._hint)
+
+    q = equil.profiles.q_spline(psi)
+    singfac = vec(1.0 ./ ((intr.mlow:intr.mhigh) .- q .* (intr.nlow:intr.nhigh)'))
+
+    # w = Q - K̄·S  (Q is diagonal; add only the diagonal entries)
+    w = -Kmat * S
+    for i in 1:N
+        w[i, i] += singfac[i]
+    end
+
+    # v = F̄⁻¹·w  via stored Cholesky factor L (L·L† = F̄)
+    v = copy(w)
+    ldiv!(LowerTriangular(L), v)
+    ldiv!(UpperTriangular(L'), v)
+
+    return adjoint(w) * v - S * Gmat * S
+end
+
+println("\n=== riccati_der! formula verification ===")
+println("Verifies riccati_der! output matches manual evaluation of Glasser 2018 Eq. 19.")
+println("Test state: Hermitian S (physical constraint). Expected error: ~machine epsilon.\n")
+
+ctrl, equil, ffit, intr = setup_solovev()
+N = intr.numpert_total
+
+odet = FFS.OdeState(N, ctrl.numsteps_init, ctrl.numunorms_init, intr.msing)
+FFS.initialize_el_at_axis!(odet, ctrl, equil.profiles, intr)
+chunks = FFS.chunk_el_integration_bounds(odet, ctrl, intr)
+
+# 30% into each chunk: well inside the interval, away from singularities at psi_end
+test_psis = [c.psi_start + 0.3 * (c.psi_end - c.psi_start) for c in chunks]
+
+println("  N=$N modes, $(length(test_psis)) test ψ points (30% into each chunk)\n")
+@printf("  %8s  %14s  %14s  %12s\n", "ψ", "‖dS_manual‖", "‖dS_ric‖", "rel error")
+println("  " * "-"^54)
+
+rng = Random.MersenneTwister(42)
+threshold = 1e-10
+
+max_err = let max_err = 0.0
+    for psi in test_psis
+        # Hermitian S: physical Riccati matrix is Hermitian (preserved by EL symmetry)
+        A = randn(rng, ComplexF64, N, N)
+        S = (A + A') / 2   # Hermitian by construction
+
+        # Manual RHS
+        dS_manual = riccati_rhs_manual(S, psi, equil, ffit, intr)
+
+        # riccati_der! RHS
+        u_ric  = zeros(ComplexF64, N, N, 2)
+        du_ric = zeros(ComplexF64, N, N, 2)
+        u_ric[:, :, 1] .= S
+        u_ric[:, :, 2] .= Matrix{ComplexF64}(I, N, N)
+        dummy_chunk = FFS.IntegrationChunk(psi, psi, false, 0, 1)
+        params = (ctrl, equil, ffit, intr, odet, dummy_chunk)
+        FFS.riccati_der!(du_ric, u_ric, params, psi)
+        dS_ric = du_ric[:, :, 1]
+
+        ref = max(norm(dS_manual), 1e-10)
+        err = norm(dS_ric - dS_manual) / ref
+        max_err = max(max_err, err)
+        status = err < threshold ? "" : "  ← FAIL"
+        @printf("  %8.4f  %14.4e  %14.4e  %12.4e%s\n", psi, norm(dS_manual), norm(dS_ric), err, status)
+    end
+    max_err
+end
+
+println()
+if max_err < threshold
+    @printf("PASSED — max rel error = %.2e (threshold %.0e)\n", max_err, threshold)
+else
+    @printf("FAILED — max rel error = %.2e exceeds threshold %.0e\n", max_err, threshold)
+    exit(1)
+end
+println()
diff --git a/benchmarks/benchmark_threads.jl b/benchmarks/benchmark_threads.jl
new file mode 100644
index 00000000..1c8b4c4c
--- /dev/null
+++ b/benchmarks/benchmark_threads.jl
@@ -0,0 +1,76 @@
+# Thread-scaling benchmark for the bidirectional parallel FM integration.
+# Runs the Solovev (N=8) and DIIID-like (N=26) examples with use_parallel=true
+# across 1, 2, 4, 8 threads and compares against the serial Riccati path.
+#
+# Usage (from JPEC_main root):
+#   for t in 1 2 4 8; do julia -t $t --project=. benchmarks/benchmark_threads.jl; done
+
+using GeneralizedPerturbedEquilibrium, TOML, Printf, Statistics
+
+function run_ffs(ex; use_parallel, use_riccati=false)
+    inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+    inputs["ForceFreeStates"]["verbose"] = false
+    inputs["ForceFreeStates"]["use_parallel"] = use_parallel
+    inputs["ForceFreeStates"]["use_riccati"] = use_riccati
+    inputs["ForceFreeStates"]["write_outputs_to_HDF5"] = false
+    intr = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesInternal(; dir_path=ex)
+    ctrl = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesControl(;
+        (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+    eq_config = GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex)
+    equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(eq_config)
+    intr.wall_settings = GeneralizedPerturbedEquilibrium.Vacuum.WallShapeSettings(;
+        (Symbol(k) => v for (k, v) in inputs["Wall"])...)
+    GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_lim!(intr, ctrl, equil)
+    intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+    GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_find!(intr, equil)
+    intr.mlow  = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+    intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+    intr.mpert = intr.mhigh - intr.mlow + 1
+    intr.mband = intr.mpert - 1
+    intr.numpert_total = intr.mpert * intr.npert
+    metric = GeneralizedPerturbedEquilibrium.ForceFreeStates.make_metric(equil; mband=intr.mband, fft_flag=ctrl.fft_flag)
+    ffit = GeneralizedPerturbedEquilibrium.ForceFreeStates.make_matrix(equil, intr, metric)
+    odet = GeneralizedPerturbedEquilibrium.ForceFreeStates.eulerlagrange_integration(ctrl, equil, ffit, intr)
+    vac = GeneralizedPerturbedEquilibrium.ForceFreeStates.free_run!(odet, ctrl, equil, ffit, intr)
+    return real(vac.et[1]), intr.numpert_total
+end
+
+function timed_run(ex; use_parallel, use_riccati=false, nwarm=1, nrep=2)
+    # Warmup
+    for _ in 1:nwarm
+        run_ffs(ex; use_parallel, use_riccati)
+    end
+    # Timed runs
+    times = Float64[]
+    local et1, N
+    for _ in 1:nrep
+        t0 = time()
+        et1, N = run_ffs(ex; use_parallel, use_riccati)
+        push!(times, time() - t0)
+    end
+    return mean(times), et1, N
+end
+
+nthreads = Threads.nthreads()
+root     = joinpath(@__DIR__, "..")
+sol_ex   = joinpath(root, "test", "test_data", "regression_solovev_ideal_example")
+diiid_ex = joinpath(root, "examples", "DIIID-like_ideal_example")
+
+println("\n=== Thread-scaling benchmark ($(nthreads) thread(s)) ===\n")
+
+for (label, ex) in [("Solovev", sol_ex), ("DIIID-like", diiid_ex)]
+    t_std,    et_std,  N = timed_run(ex; use_parallel=false, use_riccati=false)
+    t_ric,    et_ric,  _ = timed_run(ex; use_parallel=false, use_riccati=true)
+    t_par,    et_par,  _ = timed_run(ex; use_parallel=true,  use_riccati=false)
+
+    err_ric = abs(et_ric - et_std) / abs(et_std) * 100
+    err_par = abs(et_par - et_std) / abs(et_std) * 100
+
+    println("$label (N=$N, nthreads=$nthreads)")
+    @printf("  standard   et[1]=%.5f  t=%.2fs  speedup=1.00×\n", et_std, t_std)
+    @printf("  riccati    et[1]=%.5f  t=%.2fs  speedup=%.2f×  err=%.4f%%\n",
+            et_ric, t_ric, t_std/t_ric, err_ric)
+    @printf("  parallel   et[1]=%.5f  t=%.2fs  speedup=%.2f×  err=%.4f%%\n",
+            et_par, t_par, t_std/t_par, err_par)
+    println()
+end
diff --git a/docs/make.jl b/docs/make.jl
index 3ab4649a..0a801037 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -26,6 +26,7 @@ makedocs(;
         "API Reference" => [
             "Vacuum" => "vacuum.md",
             "Equilibrium" => "equilibrium.md",
+            "Stability Analysis" => "stability.md",
             "Utilities" => "utilities.md",
             "Forcing Terms" => "forcing_terms.md",
             "Perturbed Equilibrium" => "perturbed_equilibrium.md"
diff --git a/docs/src/equilibrium.md b/docs/src/equilibrium.md
index bbd5aa4c..2353facf 100644
--- a/docs/src/equilibrium.md
+++ b/docs/src/equilibrium.md
@@ -104,6 +104,7 @@ Notes:
 
 ## See also
 
+- `docs/src/stability.md` — ideal MHD stability analysis built on top of the equilibrium
 - `docs/src/splines.md` — spline helpers used by equilibrium routines
 - `docs/src/vacuum.md` — coupling between equilibrium and vacuum solvers
 
diff --git a/docs/src/stability.md b/docs/src/stability.md
new file mode 100644
index 00000000..59bc7136
--- /dev/null
+++ b/docs/src/stability.md
@@ -0,0 +1,304 @@
+# Ideal MHD Stability (ForceFreeStates)
+
+The `ForceFreeStates` module implements ideal MHD stability analysis for axisymmetric toroidal
+plasmas following the direct Newcomb criterion described in [Glasser 2016].  It solves the
+Euler-Lagrange (EL) system derived from the potential energy functional, identifies singular
+(rational) surfaces where resonant coupling occurs, and returns eigenmode energies, the
+tearing stability parameters Δ', and the full inter-surface Δ' matrix.
+
+## Physical background
+
+Ideal MHD stability is determined by the sign of the perturbed potential energy
+
+```math
+\delta W[\xi] = \int_0^{\psi_\mathrm{lim}} \mathcal{F}(\xi, \xi') \, d\psi,
+```
+
+where ``\xi(\psi)`` is the poloidal displacement vector.  The extremum of ``\delta W`` over all
+admissible ``\xi`` satisfies the Euler-Lagrange system [Glasser 2016, Eq. 24]:
+
+```math
+\frac{d}{d\psi}
+\begin{pmatrix} U_1 \\ U_2 \end{pmatrix}
+=
+\begin{pmatrix} A & B \\ C & D \end{pmatrix}
+\begin{pmatrix} U_1 \\ U_2 \end{pmatrix},
+\quad
+A = -Q\bar{F}^{-1}\bar{K}, \;
+B = Q\bar{F}^{-1}Q, \;
+C = \bar{G} - \bar{K}^\dagger\bar{F}^{-1}\bar{K}, \;
+D = \bar{K}^\dagger\bar{F}^{-1}Q,
+```
+
+where ``\bar{F}``, ``\bar{K}``, ``\bar{G}`` are the MHD metric matrices in Fourier-mode space
+and ``Q = \mathrm{diag}(1/(m - nq))`` is the singular factor.  The Newcomb criterion states
+that the plasma is stable if and only if this system admits a regular solution that remains
+finite across every rational surface.
+
+**Key references**
+
+| Paper | Content |
+|-------|---------|
+| [Glasser 2016] Phys. Plasmas **23**, 112506 | Newcomb criterion, EL system, standard DCON integration |
+| [Glasser 2018a] Phys. Plasmas **25**, 032507 | Riccati reformulation, reduced stiffness near singular surfaces |
+| [Glasser 2018b] Phys. Plasmas **25**, 032501 | STRIDE code: parallel FM integration, inter-surface Δ' matrix |
+
+## Integration methods
+
+Three integration drivers are available, all solving the same EL system but with different
+numerical strategies.
+
+### Standard integration
+
+`eulerlagrange_integration` is the baseline driver.  It integrates the EL ODE directly in
+``(U_1, U_2)`` using Tsit5 with adaptive step control.  Near each rational surface the
+columns of ``U_2`` that correspond to resonant modes are zeroed via Gaussian reduction (GR),
+keeping the solution bounded.  This is the reference path for correctness comparisons.
+
+Enable with (default):
+```toml
+[ForceFreeStates]
+use_riccati  = false
+use_parallel = false
+```
+
+### Riccati integration
+
+`riccati_eulerlagrange_integration` reformulates the problem in terms of the dual Riccati
+matrix ``S = U_1 \cdot U_2^{-1}`` [Glasser 2018a, Eq. 19]:
+
+```math
+\frac{dS}{d\psi} = w^\dagger \bar{F}^{-1} w - S\bar{G}S, \qquad
+w = Q - \bar{K}S.
+```
+
+``S`` remains bounded near rational surfaces (where ``U_1, U_2`` grow exponentially), so the
+solver takes fewer steps.  Rather than integrating the quadratic Riccati ODE directly (which
+blows up when ``|S|`` is large), the code integrates the linear EL system with
+`sing_der!` as the RHS and recovers ``S = U_1 U_2^{-1}`` via periodic renormalization — an
+approach that is mathematically equivalent to O(Δψ) but uses the ODE solver's full 5th-order
+accuracy.
+
+Renormalization is triggered whenever ``\max(|U_1|)`` or ``\max(|U_2|)`` exceeds the
+threshold `ucrit` (default 1e6), and is forced at the end of each chunk.  At singular surface
+crossings, `riccati_cross_ideal_singular_surf!` applies the small-asymptotic matching
+directly in column `ipert_res` — without Gaussian reduction — and renormalizes to ``(S, I)``.
+
+Enable with:
+```toml
+[ForceFreeStates]
+use_riccati  = true
+use_parallel = false
+```
+
+**Speedup** (benchmarked on reference examples):
+
+| Example | N modes | Speedup vs standard |
+|---------|---------|---------------------|
+| Solovev | 8  | ~1.6× (1 thread), ~2.8× (4 threads) |
+| DIIID   | 26 | ~2.0× (1 thread), ~1.3× (4 threads) |
+
+### Parallel fundamental-matrix (FM) integration
+
+`parallel_eulerlagrange_integration` decomposes the radial domain into independent chunks and
+integrates each chunk in parallel using `Threads.@threads`.  Each chunk produces a
+fundamental-matrix (FM) propagator.  Serial post-processing multiplies the propagators in
+order and applies each singular-surface crossing, recovering the same EL trajectory as the
+Riccati path.
+
+#### Bidirectional integration for large N
+
+For large mode counts the FM propagator for a chunk ending near a rational surface is
+ill-conditioned: the EL solutions grow exponentially toward the rational surface, so the
+forward FM amplifies numerical errors.  GPEC follows the STRIDE approach [Glasser 2018b,
+Sec. III.A]: the crossing chunk (the last sub-chunk before each rational surface) is
+integrated *backward* — from the rational surface toward the interior — producing a
+well-conditioned backward FM ``\Phi_L``.  The forward propagation is recovered as
+``\Phi_L^{-1}`` via an LU solve in serial assembly, which is accurate precisely because
+``\Phi_L`` is well-conditioned.
+
+The implementation uses a `direction` field on `IntegrationChunk`:
+
+- `direction = +1`: standard forward integration, `tspan = (ψ_start, ψ_end)`.
+- `direction = -1`: backward integration, `tspan = (ψ_end, ψ_start)` (reversed).
+
+`chunk_el_integration_bounds(...; bidirectional=true)` assigns `direction = -1` to every
+crossing chunk.  `balance_integration_chunks` preserves this: the sub-chunk closest to the
+rational surface inherits `direction`, while the earlier sub-chunk always gets `direction=+1`.
+
+Enable with:
+```toml
+[ForceFreeStates]
+use_parallel = true
+```
+
+**Accuracy** (N=26, DIIID-like example): energy eigenvalue within 2% of standard path.
+The residual ~2% gap comes from the different crossing convention (Riccati-style direct
+zeroing vs GR), not from ODE tolerance; it is present in both 1-thread and 4-thread runs.
+
+## Δ' tearing stability parameter
+
+### Per-surface Δ' (`delta_prime`)
+
+At each rational surface the asymptotic matching condition gives the tearing stability
+parameter [Glasser 2016]:
+
+```math
+\Delta'_s = \frac{c_{a,r}[i_s,i_s,2] - c_{a,l}[i_s,i_s,2]}{4\pi^2 \psi_0},
+```
+
+where ``c_{a,l}`` and ``c_{a,r}`` are the left and right asymptotic coefficients at surface
+``s``, and ``i_s`` is the column index of the resonant mode.  Positive ``\Delta' > 0``
+indicates a tearing-unstable surface.
+
+The Riccati and parallel FM paths populate `intr.sing[s].delta_prime` (a length-``n_\mathrm{res}``
+vector) inline during each crossing.  A companion vector `delta_prime_col` (length N) stores
+the coupling of all poloidal modes to the resonant mode at surface ``s``:
+
+```math
+(\Delta'_\mathrm{col})_{j,i} = \frac{c_{a,r}[j,i_s,2] - c_{a,l}[j,i_s,2]}{4\pi^2 \psi_0}.
+```
+
+The diagonal element ``(\Delta'_\mathrm{col})_{i_s,i}`` equals `delta_prime[i]` exactly by
+construction.
+
+### Inter-surface Δ' matrix (`delta_prime_matrix`)
+
+`compute_delta_prime_matrix!` assembles the full ``2 m_\mathrm{sing} \times 2 m_\mathrm{sing}``
+inter-surface tearing matrix following the STRIDE global BVP [Glasser 2018b, Sec. III.B].
+The BVP unknowns are the plasma state at the left and right inner-layer boundaries of every
+rational surface; the driving terms are unit-amplitude asymptotic solutions at each boundary.
+The resulting matrix encodes the full plasma response between all pairs of surfaces and is
+required for resistive stability analysis of multi-surface configurations.
+
+The BVP is well-conditioned because it is formulated using the split ``(\Phi_R, \Phi_L)``
+propagator blocks from bidirectional integration rather than the monolithic forward product
+``\Phi_L^{-1} \Phi_R`` (which is ill-conditioned for large N):
+
+```math
+\Phi_R[j] \cdot x_R[j-1] - \Phi_L[j] \cdot x_L[j] = 0
+\quad \text{(junction at } \psi_m[j]\text{)},
+```
+
+where ``\Phi_R[j]`` is the forward FM product from ``\psi_{R,j-1}`` to the junction, and
+``\Phi_L[j]`` is the backward crossing FM from ``\psi_{L,j}`` to the junction.
+
+The matrix is only populated by the parallel FM path and is written to the HDF5 output
+under `singular/delta_prime_matrix`.
+
+## Configuration reference
+
+All `ForceFreeStates` options are set in the `[ForceFreeStates]` section of `gpec.toml`.
+
+```toml
+[ForceFreeStates]
+# Integration driver
+use_riccati  = false   # true: Riccati path (faster, same accuracy)
+use_parallel = false   # true: parallel FM path (multi-thread, large N)
+
+# Mode space
+nn_low       = 1       # lowest toroidal mode number
+nn_high      = 1       # highest toroidal mode number
+delta_mlow   = 0       # extra low poloidal modes (m < mlow)
+delta_mhigh  = 0       # extra high poloidal modes (m > mhigh)
+
+# ODE solver
+numsteps_init     = 200    # initial step budget per chunk
+numunorms_init    = 50     # renorm checkpoint budget
+reltol            = 1e-6   # ODE relative tolerance
+
+# Output
+verbose              = true
+write_outputs_to_HDF5 = true
+```
+
+The number of Julia threads is controlled at startup via `-t N` or the `JULIA_NUM_THREADS`
+environment variable; it is not a runtime parameter.
+
+## API Reference
+
+```@autodocs
+Modules = [GeneralizedPerturbedEquilibrium.ForceFreeStates]
+```
+
+## Example usage
+
+### Run stability analysis from a TOML configuration
+
+```julia
+using GeneralizedPerturbedEquilibrium, TOML
+
+const FFS = GeneralizedPerturbedEquilibrium.ForceFreeStates
+
+ex     = "examples/Solovev_ideal_example"
+inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+
+ctrl  = FFS.ForceFreeStatesControl(;
+            (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(
+            GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex))
+
+intr  = FFS.ForceFreeStatesInternal(; dir_path=ex)
+intr.wall_settings = GeneralizedPerturbedEquilibrium.Vacuum.WallShapeSettings(;
+    (Symbol(k) => v for (k, v) in inputs["Wall"])...)
+FFS.sing_lim!(intr, ctrl, equil)
+intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+FFS.sing_find!(intr, equil)
+intr.mlow  = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+intr.mpert = intr.mhigh - intr.mlow + 1
+intr.mband = intr.mpert - 1
+intr.numpert_total = intr.mpert * intr.npert
+
+metric = FFS.make_metric(equil; mband=intr.mband, fft_flag=ctrl.fft_flag)
+ffit   = FFS.make_matrix(equil, intr, metric)
+
+# Choose integration driver
+odet = ctrl.use_parallel ? FFS.parallel_eulerlagrange_integration(ctrl, equil, ffit, intr) :
+       ctrl.use_riccati  ? FFS.riccati_eulerlagrange_integration(ctrl, equil, ffit, intr) :
+                           FFS.eulerlagrange_integration(ctrl, equil, ffit, intr)
+
+vac = FFS.free_run!(odet, ctrl, equil, ffit, intr)
+println("Energy eigenvalue et[1] = ", real(vac.et[1]))
+```
+
+### Inspect Δ' at singular surfaces
+
+```julia
+for s in 1:intr.msing
+    sing = intr.sing[s]
+    println("Surface $s: ψ = $(sing.psi_s), m/n = $(sing.m[1])/$(sing.n[1])")
+    println("  Δ' = $(real(sing.delta_prime[1]))")
+end
+```
+
+### Access inter-surface Δ' matrix (parallel FM path)
+
+```julia
+# intr.delta_prime_matrix is 2·msing × 2·msing after parallel_eulerlagrange_integration
+dpm = intr.delta_prime_matrix
+println("Δ' matrix size: ", size(dpm))
+println("Diagonal (surface response to self-driving):")
+for j in 1:intr.msing
+    println("  Surface $j left:  ", real(dpm[2j-1, 2j-1]))
+    println("  Surface $j right: ", real(dpm[2j,   2j  ]))
+end
+```
+
+## Notes
+
+- The standard path does not populate `delta_prime`; use `PerturbedEquilibrium.SingularCoupling`
+  for Δ' on the standard path (it reads `ca_l`/`ca_r` directly).
+- The Riccati and parallel FM paths compute Δ' inline at each crossing, using the
+  direct diagonal formula (no GR permutation).  The result in `delta_prime_col[ipert_res, i]`
+  equals `delta_prime[i]` to machine precision.
+- `delta_prime_matrix` contains raw BVP coefficients, not asymptotic-normalized values;
+  its diagonal elements do **not** in general equal `delta_prime`.
+- ODE step counts depend on the equilibrium profile and mode count; the `numsteps_init`
+  parameter sets the initial allocation but the solver adapts automatically.
+
+## See also
+
+- `docs/src/equilibrium.md` — build the `PlasmaEquilibrium` object required by this module
+- `docs/src/vacuum.md` — vacuum response computed from the EL solution in `free_run!`
+- `docs/src/perturbed_equilibrium.md` — downstream singular coupling analysis using Δ'
diff --git a/src/ForceFreeStates/EulerLagrange.jl b/src/ForceFreeStates/EulerLagrange.jl
index 20b2ed39..9871f145 100644
--- a/src/ForceFreeStates/EulerLagrange.jl
+++ b/src/ForceFreeStates/EulerLagrange.jl
@@ -1,3 +1,141 @@
+"""
+    compute_delta_prime_from_ca!(odet, intr, equil)
+
+Compute the tearing stability parameter Δ' for each singular surface from the
+asymptotic coefficients `ca_l` and `ca_r` accumulated during integration.
+
+Uses the diagonal formula Δ'[i] = (ca_r[i,i,2,s] - ca_l[i,i,2,s]) / (4π² · psio),
+which is correct when the small asymptotic was introduced in column `ipert_res` directly
+(no GR permutation).
+
+**Note**: This function is no longer called from any integration driver. Δ' is now computed
+inline inside each crossing function where the correct column index is known:
+- `cross_ideal_singular_surf!` uses `perm_col` (GR-permuted column)
+- `riccati_cross_ideal_singular_surf!` uses the diagonal `ipert_res` (no GR permutation)
+
+Retained for reference and potential use in testing.
+
+This matches the formula in `PerturbedEquilibrium/SingularCoupling.jl` (lines ~197):
+  `delta_prime_val = (rbwp1 - lbwp1) / (twopi * chi1)`
+with `chi1 = 2π·psio`, so the denominators are identical.
+"""
+function compute_delta_prime_from_ca!(odet::OdeState, intr::ForceFreeStatesInternal, equil::Equilibrium.PlasmaEquilibrium)
+    denom = (2π)^2 * equil.psio  # = twopi * chi1 in SingularCoupling.jl
+    for s in 1:intr.msing
+        sing = intr.sing[s]
+        n_modes = length(sing.m)
+        resize!(intr.sing[s].delta_prime, n_modes)
+        for i in 1:n_modes
+            ipert_res = 1 + sing.m[i] - intr.mlow + (sing.n[i] - intr.nlow) * intr.mpert
+            if 1 <= ipert_res <= intr.numpert_total
+                Δca = odet.ca_r[ipert_res, ipert_res, 2, s] - odet.ca_l[ipert_res, ipert_res, 2, s]
+                intr.sing[s].delta_prime[i] = Δca / denom
+            else
+                intr.sing[s].delta_prime[i] = 0.0 + 0.0im
+            end
+        end
+    end
+end
+
+"""
+    ode_itime_cost(psi1, psi2, intr) -> Float64
+
+Estimate the relative ODE integration cost for the interval [ψ₁, ψ₂] using the
+empirical log-divergent cost model from STRIDE (Glasser 2018).
+
+The cost is a sum of logarithmic contributions from reference points:
+  - Magnetic axis (ψ_ref = 0): steep divergence, (a,b) = (39695, 212830)
+  - Each rational surface (ψ_ref = ψ_s): moderate divergence, (a,b) = (17147, 470710)
+  - Edge (ψ_ref = ψ_lim): mild divergence, (a,b) = (1646, 4683)
+
+For each reference: cost += (a/b) * |log(1 + b|ψ₂-ref|) - log(1 + b|ψ₁-ref|)|
+
+The cost model is additive for sub-intervals not containing rational surfaces,
+which makes it suitable for equal-cost splitting via bisection.
+"""
+function ode_itime_cost(psi1::Float64, psi2::Float64, intr::ForceFreeStatesInternal)
+    a_ax, b_ax = 39695.0, 212830.0
+    a_rat, b_rat = 17147.0, 470710.0
+    a_edge, b_edge = 1646.0, 4683.0
+
+    cost = (a_ax / b_ax) * abs(log(1.0 + b_ax * abs(psi2)) - log(1.0 + b_ax * abs(psi1)))
+
+    for sing in intr.sing
+        ref = sing.psifac
+        cost += (a_rat / b_rat) * abs(log(1.0 + b_rat * abs(psi2 - ref)) - log(1.0 + b_rat * abs(psi1 - ref)))
+    end
+
+    ref_edge = intr.psilim
+    cost += (a_edge / b_edge) * abs(log(1.0 + b_edge * abs(psi2 - ref_edge)) - log(1.0 + b_edge * abs(psi1 - ref_edge)))
+
+    return cost
+end
+
+"""
+    balance_integration_chunks(chunks, ctrl, intr) -> Vector{IntegrationChunk}
+
+Sub-divide integration chunks to produce a load-balanced set for parallel execution.
+Starts from the output of `chunk_el_integration_bounds` and iteratively splits the
+highest-cost chunk (by `ode_itime_cost`) until the total chunk count reaches
+`max(2*msing + 3, 4 * Threads.nthreads())`.
+
+Each split finds the equal-cost midpoint ψ_mid via bisection:
+  ode_itime_cost(psi_start, psi_mid) ≈ ode_itime_cost(psi_start, psi_end) / 2
+
+Sub-chunks inherit `needs_crossing=false` and `ising=0`. Only the LAST sub-chunk of
+each original chunk retains `needs_crossing=true` and the original `ising`, so the
+rational surface crossing still fires at the correct ψ in the serial assembly phase.
+"""
+function balance_integration_chunks(chunks::Vector{IntegrationChunk}, ctrl::ForceFreeStatesControl, intr::ForceFreeStatesInternal)
+    min_chunks = 2 * intr.msing + 3
+    target_n = max(min_chunks, 4 * Threads.nthreads())
+
+    result = collect(chunks)
+
+    while length(result) < target_n
+        # Find the highest-cost splittable chunk
+        best_idx = 0
+        best_cost = -Inf
+        for (i, chunk) in enumerate(result)
+            width = chunk.psi_end - chunk.psi_start
+            if width > 1e-8
+                c = ode_itime_cost(chunk.psi_start, chunk.psi_end, intr)
+                if c > best_cost
+                    best_cost = c
+                    best_idx = i
+                end
+            end
+        end
+
+        best_idx == 0 && break  # No more splittable chunks
+
+        chunk = result[best_idx]
+        total_cost = best_cost
+        target_cost = total_cost / 2.0
+
+        # Bisect to find ψ_mid where cost(psi_start, ψ_mid) ≈ target_cost
+        lo, hi = chunk.psi_start, chunk.psi_end
+        for _ in 1:50
+            mid = (lo + hi) / 2.0
+            if ode_itime_cost(chunk.psi_start, mid, intr) < target_cost
+                lo = mid
+            else
+                hi = mid
+            end
+        end
+        psi_mid = (lo + hi) / 2.0
+
+        left = IntegrationChunk(; psi_start=chunk.psi_start, psi_end=psi_mid,
+                                  needs_crossing=false, ising=0, direction=1)
+        right = IntegrationChunk(; psi_start=psi_mid, psi_end=chunk.psi_end,
+                                   needs_crossing=chunk.needs_crossing, ising=chunk.ising,
+                                   direction=chunk.direction)
+        splice!(result, best_idx, [left, right])
+    end
+
+    return result
+end
+
 """
     eulerlagrange_integration(ctrl::ForceFreeStatesControl, equil::Equilibrium.PlasmaEquilibrium, ffit::FourFitVars, intr::ForceFreeStatesInternal)
 
@@ -22,6 +160,13 @@ An OdeState struct containing the final state of the ODE solver after integratio
 """
 function eulerlagrange_integration(ctrl::ForceFreeStatesControl, equil::Equilibrium.PlasmaEquilibrium, ffit::FourFitVars, intr::ForceFreeStatesInternal)
 
+    # Dispatch to parallel or Riccati solver if requested
+    if ctrl.use_parallel
+        return parallel_eulerlagrange_integration(ctrl, equil, ffit, intr)
+    elseif ctrl.use_riccati
+        return riccati_eulerlagrange_integration(ctrl, equil, ffit, intr)
+    end
+
     # Initialization
     odet = OdeState(intr.numpert_total, ctrl.numsteps_init, ctrl.numunorms_init, intr.msing)
     if ctrl.sing_start <= 0
@@ -168,7 +313,7 @@ making the integration flow more predictable and easier to parallelize (e.g., fo
 
 Support for `kin_flag`
 """
-function chunk_el_integration_bounds(odet::OdeState, ctrl::ForceFreeStatesControl, intr::ForceFreeStatesInternal)
+function chunk_el_integration_bounds(odet::OdeState, ctrl::ForceFreeStatesControl, intr::ForceFreeStatesInternal; bidirectional::Bool=false)
     chunks = IntegrationChunk[]
 
     # Start from current position
@@ -207,7 +352,8 @@ function chunk_el_integration_bounds(odet::OdeState, ctrl::ForceFreeStatesContro
                 psi_start=psi_current,
                 psi_end=psi_end,
                 needs_crossing=true,
-                ising=ising_current
+                ising=ising_current,
+                direction = bidirectional ? -1 : 1
             ))
 
             # After crossing, we jump to the other side of the singular surface
@@ -278,7 +424,7 @@ function cross_ideal_singular_surf!(odet::OdeState, ctrl::ForceFreeStatesControl
     end
 
     # Re-initialize on opposite side of rational surface by approximating solution
-    params = (ctrl, equil, ffit, intr, odet, IntegrationChunk(0.0, 0.0, false, ising))
+    params = (ctrl, equil, ffit, intr, odet, IntegrationChunk(0.0, 0.0, false, ising, 1))
     du1 = zeros(ComplexF64, intr.numpert_total, intr.numpert_total, 2)
     du2 = zeros(ComplexF64, intr.numpert_total, intr.numpert_total, 2)
     sing_der!(du1, odet.u, params, odet.psifac)
@@ -299,6 +445,15 @@ function cross_ideal_singular_surf!(odet::OdeState, ctrl::ForceFreeStatesControl
     # Get asymptotic coefficients after crossing rational surface
     odet.ca_r[:, :, :, ising] .= sing_get_ca(odet.u, ua, intr)
 
+    # Note: Δ' is NOT computed for the standard path. The physical Δ' is a complex
+    # normalization-convention-dependent quantity: the correct value requires the solution
+    # columns to be in the Riccati gauge (U₂=I), which is maintained by the Riccati
+    # renormalization. The standard path's solution columns grow from the axis with an
+    # arbitrary complex phase; dividing by the outer asymptotic coefficient normalizes the
+    # magnitude but not the complex phase, so the result is in a different convention.
+    # Δ' is computed inline in riccati_cross_ideal_singular_surf! for the Riccati and
+    # parallel FM paths, where the renormalization convention is consistent.
+
     # Store values after crossing step and advance
     odet.psi_store[odet.step] = odet.psifac
     odet.q_store[odet.step] = odet.q
diff --git a/src/ForceFreeStates/ForceFreeStates.jl b/src/ForceFreeStates/ForceFreeStates.jl
index 7d580322..859c4067 100644
--- a/src/ForceFreeStates/ForceFreeStates.jl
+++ b/src/ForceFreeStates/ForceFreeStates.jl
@@ -29,6 +29,7 @@ include("Fourfit.jl")
 include("FixedBoundaryStability.jl")
 include("Utils.jl")
 include("Free.jl")
+include("Riccati.jl")
 
 # These are used for various small tolerances and root finders throughout ForceFreeStates
 global eps = 1e-10
diff --git a/src/ForceFreeStates/ForceFreeStatesStructs.jl b/src/ForceFreeStates/ForceFreeStatesStructs.jl
index d96c0cbe..6e9f2de1 100644
--- a/src/ForceFreeStates/ForceFreeStatesStructs.jl
+++ b/src/ForceFreeStates/ForceFreeStatesStructs.jl
@@ -13,6 +13,13 @@ A mutable struct holding data related to the singular surfaces in the equilibriu
   - `q1::Float64` - Derivative of safety factor with respect to ψ
   - `grri::Array{Float64,2}` - Interior Green's function at this surface [2*mthvac, 2*mpert]
   - `grre::Array{Float64,2}` - Exterior Green's function at this surface [2*mthvac, 2*mpert]
+  - `delta_prime::Vector{ComplexF64}` - Tearing stability Δ' per resonant mode (indexed same as m/n)
+  - `delta_prime_col::Matrix{ComplexF64}` - Full Δ' column: shape (numpert_total × n_res_modes).
+    `delta_prime_col[j, i]` = (ca_r[j,ipert_res_i,2] - ca_l[j,ipert_res_i,2]) / (4π²·psio),
+    the coupling of mode j to resonant mode i through the singular layer.
+    The diagonal element `delta_prime_col[ipert_res_i, i]` equals `delta_prime[i]`.
+    Off-diagonal elements represent intra-surface mode coupling via the small asymptotic.
+    Only populated for the Riccati/parallel FM paths (not the standard path).
 """
 @kwdef mutable struct SingType
     psifac::Float64 = 0.0
@@ -23,6 +30,8 @@ A mutable struct holding data related to the singular surfaces in the equilibriu
     q1::Float64 = 0.0
     grri::Array{Float64,2} = Array{Float64}(undef, 0, 0)
     grre::Array{Float64,2} = Array{Float64}(undef, 0, 0)
+    delta_prime::Vector{ComplexF64} = ComplexF64[]
+    delta_prime_col::Matrix{ComplexF64} = Matrix{ComplexF64}(undef, 0, 0)
 end
 
 """
@@ -67,14 +76,46 @@ A struct representing a region of integration in the Euler-Lagrange solver.
   - `psi_end::Float64` - Ending ψ coordinate for this integration region
   - `needs_crossing::Bool` - Whether a rational surface crossing is needed after this chunk
   - `ising::Int` - Index of the singular surface associated with this chunk (0 if none)
+  - `direction::Int` - Integration direction: +1 forward (axis→edge), -1 backward (edge→axis).
+    For `direction=-1` chunks, `psi_start` < `psi_end` but integration proceeds from `psi_end`
+    toward `psi_start`. The resulting propagator maps state at `psi_end` → state at `psi_start`.
+    Used in bidirectional parallel FM to produce well-conditioned crossing-chunk propagators:
+    solutions that grow exponentially forward (toward a singularity) decay when integrated
+    backward, so the backward propagator is well-conditioned.
 """
 @kwdef struct IntegrationChunk
     psi_start::Float64
     psi_end::Float64
     needs_crossing::Bool
     ising::Int = 0
+    direction::Int = 1   # +1 forward, -1 backward
 end
 
+"""
+    ChunkPropagator
+
+Fundamental matrix for one integration chunk, stored as two N×N×2 solution blocks.
+Represents the propagator Φ(ψ₂,ψ₁) computed by integrating the EL ODE from two
+identity-block initial conditions:
+
+  - `block_upper_ic`: result of integrating with IC = (I_N, 0_N)  (U₁ = I, U₂ = 0)
+  - `block_lower_ic`: result of integrating with IC = (0_N, I_N)  (U₁ = 0, U₂ = I)
+
+Applying the propagator to the current state `u_prev`:
+
+  u₁_new = block_upper_ic[:,:,1] · u₁_prev + block_lower_ic[:,:,1] · u₂_prev
+  u₂_new = block_upper_ic[:,:,2] · u₁_prev + block_lower_ic[:,:,2] · u₂_prev
+
+Since each chunk starts from a bounded identity IC (rather than the accumulated state),
+exponential growth within a chunk does not affect the conditioning of the overall
+assembly. This enables `Threads.@threads` parallel integration across all chunks.
+"""
+struct ChunkPropagator
+    block_upper_ic::Array{ComplexF64,3}   # shape (N, N, 2) — result from IC = (I, 0)
+    block_lower_ic::Array{ComplexF64,3}   # shape (N, N, 2) — result from IC = (0, I)
+end
+ChunkPropagator(N::Int) = ChunkPropagator(zeros(ComplexF64, N, N, 2), zeros(ComplexF64, N, N, 2))
+
 """
 DebugSettings
 
@@ -144,6 +185,15 @@ A mutable struct holding internal state variables for stability calculations.
     locstab::FastInterpolations.CubicSeriesInterpolant = cubic_interp(collect(0.0:0.25:1.0), zeros(5, 5); bc=NaturalBC())
     debug_settings::DebugSettings = DebugSettings()
     wall_settings::Vacuum.WallShapeSettings = Vacuum.WallShapeSettings()
+    """
+    Inter-surface tearing stability matrix of shape (2*msing × 2*msing).
+    delta_prime_matrix[2j-1, 2k-1] = small-asymptotic amplitude at left of surface j
+                                       when left of surface k is driven with unit amplitude.
+    Populated by `compute_delta_prime_matrix!` (parallel FM path only).
+    Uses bidirectional propagators (backward crossing chunks + forward intermediate chunks)
+    for a well-conditioned BVP, improving accuracy for large N (N ≳ 20).
+    """
+    delta_prime_matrix::Matrix{ComplexF64} = Matrix{ComplexF64}(undef, 0, 0)
 end
 
 """
@@ -198,7 +248,6 @@ A mutable struct containing control parameters for stability analysis, set by th
   - `qlow::Float64` - Integration terminated at q limit determined by minimum of qlow and q0 from equil
   - `reform_eq_with_psilim::Bool` - Reform equilibrium with computed psilim (not yet implemented)
   - `psiedge::Float64` - If less then psilim, calculates dW(psi) between psiedge and psilim, then runs with truncation at max(dW)
-  - `parallel_threads::Int` - Number of parallel threads (not yet implemented)
   - `diagnose::Bool` - Enable diagnostic output (not yet implemented)
   - `diagnose_ca::Bool` - Enable asymptotic coefficient diagnostics (not yet implemented)
   - `write_outputs_to_HDF5::Bool` - Write results to HDF5 format
@@ -206,6 +255,8 @@ A mutable struct containing control parameters for stability analysis, set by th
   - `force_wv_symmetry::Bool` - Boolean flag to enforce symmetry in the vacuum response matrix
   - `save_interval::Int` - Save every Nth ODE step (1=all, 10=every 10th). Always saves near rational surfaces. (Same as `euler_step` in the Fortran)
   - `force_termination::Bool` - Terminate after force-free states (skip perturbed equilibrium calculations)
+  - `use_riccati::Bool` - Use the dual Riccati reformulation S = U₁·U₂⁻¹ instead of the standard U₁/U₂ ODE. Reduces stiffness for faster integration. See Glasser (2018) Phys. Plasmas 25, 032507.
+  - `use_parallel::Bool` - Parallel fundamental matrix (propagator) integration using `Threads.@threads`. Each chunk is integrated independently from identity IC and assembled serially. Requires `singfac_min != 0`. Uses the same chunk bounds as the standard path but sub-divides chunks for load balancing. Crossings use the Riccati-style algorithm (no Gaussian reduction).
 """
 @kwdef mutable struct ForceFreeStatesControl
     verbose::Bool = true
@@ -253,7 +304,6 @@ A mutable struct containing control parameters for stability analysis, set by th
     qlow::Float64 = 0.0
     reform_eq_with_psilim::Bool = false
     psiedge::Float64 = 1.0
-    parallel_threads::Int = 1
     diagnose::Bool = false
     diagnose_ca::Bool = false
     write_outputs_to_HDF5::Bool = true
@@ -261,6 +311,8 @@ A mutable struct containing control parameters for stability analysis, set by th
     force_wv_symmetry::Bool = true
     save_interval::Int = 10
     force_termination::Bool = false
+    use_riccati::Bool = false
+    use_parallel::Bool = false
 end
 
 @kwdef mutable struct FourFitVars{S<:CubicSeriesInterpolant,Opts<:NamedTuple}
@@ -329,8 +381,8 @@ Populated in `Free.jl`.
   - `et::Vector{ComplexF64}` - Total eigenvalues of plasma + vacuum
   - `grri::Array{Float64, 2}` - Interior Green's function matrices (2 * mthvac * nzvac × 2 * numpert_total)
   - `grre::Array{Float64, 2}` - Exterior Green's function matrices (2 * mthvac * nzvac × 2 * numpert_total)
-  - `plasma_pts::Array{Float64, 3}` - Cartesian coordinates of plasma points [x, y, z] (mthvac * nzvac × 3)
-  - `wall_pts::Array{Float64, 3}` - Cartesian coordinates of wall points [x, y, z] (mthvac * nzvac × 3)
+  - `plasma_pts::Array{Float64, 3}` - Cartesian coordinates of plasma points, shape (mthvac * nzvac) × 3 for (x, y, z)
+  - `wall_pts::Array{Float64, 3}` - Cartesian coordinates of wall points, shape (mthvac * nzvac) × 3 for (x, y, z)
 """
 @kwdef mutable struct VacuumData
     numpoints::Int
diff --git a/src/ForceFreeStates/Riccati.jl b/src/ForceFreeStates/Riccati.jl
new file mode 100644
index 00000000..eb987f58
--- /dev/null
+++ b/src/ForceFreeStates/Riccati.jl
@@ -0,0 +1,1026 @@
+"""
+    Riccati.jl - Dual Riccati reformulation of the Euler-Lagrange ODE
+
+Implements the dual Riccati matrix S = U₁ · U₂⁻¹ = P⁻¹, which satisfies a bounded
+ODE even near singular surfaces where U₁, U₂ grow exponentially. This reduced stiffness
+leads to fewer ODE integration steps and faster wall-clock time.
+
+Reference: Glasser (2018) Phys. Plasmas 25, 032507 — Eq. 19 (adapted for dual form S = P⁻¹)
+where P = U₂ · U₁⁻¹ is the forward plasma response matrix.
+
+## Dual Riccati ODE
+
+Starting from the Euler-Lagrange system [Glasser 2016 eq. 24]:
+  dU₁/dψ = A·U₁ + B·U₂        A = -Q·F̄⁻¹·K̄,  B = Q·F̄⁻¹·Q
+  dU₂/dψ = C·U₁ + D·U₂        C = Ḡ - K̄†·F̄⁻¹·K̄,  D = K̄†·F̄⁻¹·Q
+
+with S = U₁·U₂⁻¹, differentiating gives the Riccati ODE:
+  dS/dψ = B + A·S - S·D - S·C·S
+
+Setting w = Q - K̄·S (shape N×N) and v = F̄⁻¹·w (Cholesky solve), this simplifies to:
+  dS/dψ = w†·v - S·Ḡ·S     [Glasser 2018 eq. 19, dual form]
+
+## Integration Strategy
+
+### Why not integrate the Riccati ODE directly?
+
+`riccati_der!` evaluates the explicit Riccati RHS `dS/dψ = w†F̄⁻¹w − S·Ḡ·S` correctly,
+but this ODE is **quadratic** in S. Near a rational surface, S grows large, so the quadratic
+term `-SGS` dominates and the RHS grows as |S|². Explicit adaptive solvers (Tsit5) use
+*relative* error control: they accept a step when |Δu|/|u| < reltol. When |S| is large,
+the absolute error |ΔS| can be enormous while the relative error stays within tolerance.
+The solver takes large steps through what is effectively a near-blowup — no amount of
+step-size adaptation saves it because the problem is the error *metric*, not the step size.
+An implicit solver could handle this stiffness, but is deferred.
+
+### Actual implementation: EL ODE + renormalization
+
+Instead we integrate the standard EL ODE (`sing_der!`) in the (U₁, U₂) variables and
+recover S = U₁·U₂⁻¹ by renormalization. This achieves the same Riccati trajectory with
+**no accuracy loss**:
+
+- `sing_der!` evaluates the exact EL RHS — no approximation.
+- Tsit5 integrates (U₁, U₂) to **5th-order accuracy** with the adaptive step-size
+  controller enforcing the configured reltol at every accepted step.
+- Renormalization `S = U₁·U₂⁻¹` is **exact** (a change of variables, not an approximation).
+- The global error is the same as the standard EL path — controlled by the ODE solver
+  reltol, not by the renormalization frequency.
+
+This works because the EL ODE is **linear** in (U₁, U₂): the RHS does not grow with |S|,
+so relative error control is faithful even when S is large. Renormalization triggered by
+`renormalize_riccati_inplace!` in the callback (when max(|U₁|) or max(|U₂|) > ucrit) keeps
+both matrices bounded, preventing overflow and maintaining a well-conditioned state for the
+solver — exactly analogous to Gaussian reduction in the standard ODE.
+
+### Consistency with the Riccati ODE (local analysis)
+
+To verify the method is consistent with the Riccati ODE, consider a single step from (S, I):
+
+  After one step: U₁_new = S + (A·S + B)·Δψ + O(Δψ²),  U₂_new = I + (C·S + D)·Δψ + O(Δψ²)
+  Renorm:         S_new = U₁_new · U₂_new⁻¹ = S + (B + A·S − S·D − S·C·S)·Δψ + O(Δψ²) ✓
+
+The leading term matches the Riccati ODE exactly. This is a local consistency check only —
+it does not imply the integration is first-order. In practice Tsit5 captures all higher-order
+terms through its internal stages, achieving 5th-order global accuracy at the configured reltol.
+
+## Storage Convention
+
+During chunk integration (with sing_der! as ODE RHS):
+  u[:,:,1] = U₁  (starts as S_prev, evolves toward new S)
+  u[:,:,2] = U₂  (starts as I, evolves with EL dynamics)
+
+After renormalization (at crossing or when norms exceed ucrit):
+  u[:,:,1] = S = U₁ · U₂⁻¹
+  u[:,:,2] = I
+
+This is compatible with downstream code (which uses U₁/U₂ ratio):
+  - Free.jl:     wp = u[:,:,2] / u[:,:,1] = I · S⁻¹ = P  ✓  (post-renorm)
+  - FixedBoundaryStability.jl: crit = min_eigval(u[:,:,1] / u[:,:,2]) = min_eigval(S)  ✓
+  - Axis init:   S(ψ₀) = 0  (initialize_el_at_axis! sets u[:,:,1]=0, u[:,:,2]=I)  ✓
+
+## Key Differences from Standard Integration
+
+1. `sing_der!` is used as the ODE RHS (same as standard, NOT `riccati_der!`)
+2. `riccati_integrator_callback!` replaces `integrator_callback!`: uses
+   `renormalize_riccati_inplace!` instead of Gaussian reduction
+3. `riccati_cross_ideal_singular_surf!` replaces `cross_ideal_singular_surf!`: skips Gaussian
+   reduction and uses ipert_res directly for column zeroing, then renormalizes to (S_new, I)
+4. `transform_u!` is skipped — S is already the true solution
+"""
+
+"""
+    assemble_fm_matrix(propagators, idx_range) -> Matrix{ComplexF64}
+
+Assemble the 2N×2N fundamental matrix (propagator) by multiplying chunk propagators
+in order for indices `idx_range`. Returns Φ_end * ... * Φ_start, so that the result
+maps the IC at the start of `idx_range[1]` to the state at the end of `idx_range[end]`.
+
+Each `ChunkPropagator` stores the 2N columns of Φ split into two N×N×2 blocks:
+```
+  block_upper_ic[:,:,1:2] ↔ Φ[:,1:N]     (result from IC=(I,0))
+  block_lower_ic[:,:,1:2] ↔ Φ[:,N+1:2N]  (result from IC=(0,I))
+```
+"""
+function assemble_fm_matrix(propagators::Vector{ChunkPropagator}, idx_range)
+    N = size(propagators[1].block_upper_ic, 1)
+    Phi = Matrix{ComplexF64}(I, 2N, 2N)
+    isempty(idx_range) && return Phi
+    for i in idx_range
+        p = propagators[i]
+        Phi_i = [p.block_upper_ic[:,:,1]  p.block_lower_ic[:,:,1];
+                 p.block_upper_ic[:,:,2]  p.block_lower_ic[:,:,2]]
+        Phi = Phi_i * Phi
+    end
+    return Phi
+end
+
+"""
+    compute_delta_prime_matrix!(intr, propagators, chunks)
+
+Compute the inter-surface tearing stability matrix (2·msing × 2·msing) using the
+STRIDE global BVP formulation [Glasser 2018 Phys. Plasmas 25, 032501, Sec. III.B].
+
+The BVP encodes the full plasma response with unknowns at each surface boundary:
+```
+  x_axis      (N):  free IC parameters at the axis  (U₁ = 0 regular solutions)
+  x_left[j]  (2N):  state at left inner-layer boundary of surface j
+  x_right[j] (2N):  state at right inner-layer boundary of surface j
+  x_edge      (N):  free IC parameters at the edge  (conducting wall, U₁ = 0)
+  Total unknowns: nMat = (2 + 4·msing)·N
+```
+
+The BVP matrix M is assembled from segment propagators, inner-layer continuity
+equations (non-resonant modes are continuous through each surface), and driving
+terms (unit U₂[ipert_res] amplitude at each surface side). Each of the 2·msing
+driving configurations is solved independently by LU back-substitution.
+
+## Well-conditioned BVP via bidirectional propagators
+
+For each inter-surface segment j (from `singR[j-1]` to `singL[j]`), the crossing chunk
+(direction=-1) was integrated backward, giving a well-conditioned backward FM:
+```
+  Phi_L[j] = propagators[i_crossings[j]]: maps state at singL[j] → state at psi_m[j]
+  Phi_R[j] = product of forward propagators: maps state at singR[j-1] → state at psi_m[j]
+```
+Continuity at the junction `psi_m[j]`:
+```
+  Phi_R[j] · x_right[j-1] = Phi_L[j] · x_left[j]
+  → Phi_R[j] · x_right[j-1] - Phi_L[j] · x_left[j] = 0
+```
+This replaces the ill-conditioned monolithic `Phi_segs[j] = Phi_L[j]⁻¹ · Phi_R[j]`
+with a split formulation where each factor is well-conditioned.
+
+Element delta_prime_matrix[dRow, 2k-1] = U₂[ipert_k] component at the left side
+of surface k when driving term dRow is active. dRow = 2j-1 (left of surface j) or
+2j (right of surface j). This is the raw BVP coefficient; it differs from `delta_prime`
+(which uses the asymptotic normalization from sing_get_ca).
+
+Only called from `parallel_eulerlagrange_integration` (requires FM propagators).
+The result is stored in `intr.delta_prime_matrix`.
+
+## Limitations
+- Assumes exactly one resonant mode per singular surface (standard single-n case).
+- Uses a conducting wall edge BC (U₁ = 0). Vacuum BC is deferred.
+"""
+function compute_delta_prime_matrix!(
+    intr::ForceFreeStatesInternal,
+    propagators::Vector{ChunkPropagator},
+    chunks::Vector{IntegrationChunk}
+)
+    msing = intr.msing
+    msing == 0 && return
+    N = intr.numpert_total
+
+    # Single-resonance assumption: each surface has exactly one resonant mode.
+    # Multi-resonance surfaces would require coupling all resonant modes simultaneously;
+    # only the first (sp.m[1], sp.n[1]) is used below.
+    @assert all(j -> length(intr.sing[j].m) == 1, 1:msing) "compute_delta_prime_matrix! only supports single-resonance surfaces"
+
+    # Find the index of the crossing chunk for each surface (direction=-1 in bidirectional mode)
+    i_crossings = findall(c -> c.needs_crossing, chunks)
+    @assert length(i_crossings) == msing
+
+    # Build Phi_L[j] (backward crossing chunk FM) and Phi_R[j] (product of forward
+    # chunks before the junction psi_m[j]) for each inter-surface segment j.
+    #
+    # Phi_L[j]: single backward chunk propagator at i_crossings[j]
+    #   Maps state at psi_end (≈ singL[j]) → psi_start (= psi_m[j], away from singularity)
+    #   Well-conditioned because growing EL solutions decay when integrated backward.
+    #
+    # Phi_R[j]: product of forward chunk propagators from singR[j-1] to psi_m[j]
+    #   Maps state at singR[j-1] → psi_m[j]
+    #   Phi_R[msing+1]: forward chunks from singR[msing] to edge (for edge BC)
+    Phi_L_mats = [assemble_fm_matrix(propagators, i_crossings[j]:i_crossings[j]) for j in 1:msing]
+    Phi_R_mats = Vector{Matrix{ComplexF64}}(undef, msing + 1)
+    Phi_R_mats[1] = assemble_fm_matrix(propagators, 1:i_crossings[1]-1)
+    for j in 2:msing
+        Phi_R_mats[j] = assemble_fm_matrix(propagators, i_crossings[j-1]+1:i_crossings[j]-1)
+    end
+    Phi_R_mats[msing+1] = assemble_fm_matrix(propagators, i_crossings[msing]+1:length(chunks))
+
+    # Resonant mode index (1:N) for each surface (single-resonance case)
+    ipert_all = [begin
+        sp = intr.sing[j]
+        idx = 1 + sp.m[1] - intr.mlow + (sp.n[1] - intr.nlow) * intr.mpert
+        @assert 1 <= idx <= N "Resonant mode index out of range"
+        idx
+    end for j in 1:msing]
+
+    # BVP dimensions
+    nMat = (2 + 4 * msing) * N
+    s2   = 2 * msing
+
+    # Column layout (1-indexed):
+    #   x_axis:     1:N
+    #   x_left[j]:  N + 4N*(j-1)+1 : N + 4N*(j-1)+2N
+    #   x_right[j]: N + 4N*(j-1)+2N+1 : N + 4N*j
+    #   x_edge:     N + 4N*msing+1 : nMat
+    col_axis     = 1:N
+    col_left(j)  = (N + 4N*(j-1)+1) : (N + 4N*(j-1)+2N)
+    col_right(j) = (N + 4N*(j-1)+2N+1) : (N + 4N*j)
+    col_edge     = (N + 4N*msing+1) : nMat
+
+    # Row layout:
+    #   Axis-to-surface 1 junction:  1:2N   (2N rows)
+    #   For each surface j:
+    #     Continuity:      2N + (4N-2)*(j-1)+1 : 2N + (4N-2)*(j-1)+(2N-2)  (2N-2 rows)
+    #     Junction/edge:   2N + (4N-2)*(j-1)+(2N-2)+1 : 2N + (4N-2)*j      (2N rows)
+    #   Driving terms:     2N + (4N-2)*msing+1 : nMat                        (2·msing rows)
+    row_drive_base = 2N + (4N-2)*msing
+
+    M = zeros(ComplexF64, nMat, nMat)
+
+    # Axis-to-surface 1 junction at psi_m[1]:
+    # Phi_R[1][:,N+1:2N]·x_axis = Phi_L[1]·x_left[1]
+    # → Phi_L[1]·x_left[1] - Phi_R[1][:,N+1:2N]·x_axis = 0
+    # (Phi_R[1][:,N+1:2N] selects the N regular-solution columns from the axis IC U₂=I)
+    M[1:2N, col_left(1)] .= Phi_L_mats[1]
+    M[1:2N, col_axis]    .= -view(Phi_R_mats[1], :, N+1:2N)
+
+    for j in 1:msing
+        ipert_j = ipert_all[j]
+
+        # Continuity at surface j: x_left[j][i] = x_right[j][i] for non-resonant i
+        # (skip i = ipert_j and i = ipert_j+N, the two resonant-mode rows)
+        row_cont = 2N + (4N-2)*(j-1)
+        for i in 1:2N
+            if i != ipert_j && i != ipert_j + N
+                row_cont += 1
+                M[row_cont, col_left(j)[i]]  =  1
+                M[row_cont, col_right(j)[i]] = -1
+            end
+        end
+
+        # Junction / edge matching (2N rows starting at row_cont+1)
+        junc_rows = (row_cont+1) : (2N + (4N-2)*j)
+        if j < msing
+            # Junction at psi_m[j+1]:
+            # Phi_R[j+1]·x_right[j] = Phi_L[j+1]·x_left[j+1]
+            # → Phi_R[j+1]·x_right[j] - Phi_L[j+1]·x_left[j+1] = 0
+            M[junc_rows, col_right(j)]   .=  Phi_R_mats[j+1]
+            M[junc_rows, col_left(j+1)]  .= -Phi_L_mats[j+1]
+        else
+            # Conducting wall: Phi_R[msing+1]·x_right[msing] = [0; I_N]·x_edge
+            # Upper N rows: U₁ = 0  (no x_edge contribution)
+            # Lower N rows: U₂ = x_edge  (contribution from -I·x_edge)
+            # (Phi_R[msing+1] is all forward chunks → same as old Phi_segs[msing+1])
+            M[junc_rows, col_right(msing)] .= Phi_R_mats[msing+1]
+            M[junc_rows[N+1:end], col_edge] .= -I(N)
+        end
+
+        # Driving terms: unit U₂[ipert_j] amplitude at left and right of surface j
+        M[row_drive_base + 2j-1, col_left(j)[ipert_j+N]]  = 1
+        M[row_drive_base + 2j,   col_right(j)[ipert_j+N]] = 1
+    end
+
+    M_lu = lu(M)
+    delta_mat = zeros(ComplexF64, s2, s2)
+    b = zeros(ComplexF64, nMat)
+
+    for jsing in 1:msing
+        for side in 1:2   # side=1: left drive; side=2: right drive
+            dRow = 2jsing - (2 - side)   # 2j-1 for left, 2j for right
+            fill!(b, 0)
+            b[row_drive_base + dRow] = 1
+            x = M_lu \ b
+
+            for ksing in 1:msing
+                ipert_k = ipert_all[ksing]
+                # Extract U₂[ipert_k] at left and right boundaries of surface ksing
+                delta_mat[dRow, 2ksing-1] = x[col_left(ksing)[ipert_k+N]]
+                delta_mat[dRow, 2ksing]   = x[col_right(ksing)[ipert_k+N]]
+            end
+        end
+    end
+
+    intr.delta_prime_matrix = delta_mat
+end
+
+"""
+    riccati_der!(du, u, params, psieval)
+
+Evaluate the explicit dual Riccati ODE right-hand side:
+  dS/dψ = w†·F̄⁻¹·w - S·Ḡ·S,   w = Q - K̄·S
+
+where Q = diag(1/(m - n·q)) is the diagonal singular factor matrix.
+The identity slice u[:,:,2] = I does not evolve (du[:,:,2] = 0).
+
+**NOTE**: This function is NOT used as the ODE RHS in `riccati_integrate_chunk!`.
+The explicit Riccati ODE is numerically unstable for explicit solvers: the quadratic
+term S·Ḡ·S causes finite-time blowup when K̄·S >> Q. Instead, `sing_der!` is used
+with periodic renormalization via `renormalize_riccati_inplace!`. This function is
+retained for reference and potential use with implicit solvers.
+
+See: Glasser (2018) Phys. Plasmas 25, 032507 — Eq. 19 (dual Riccati form)
+"""
+@with_pool pool function riccati_der!(
+    du::Array{ComplexF64,3},
+    u::Array{ComplexF64,3},
+    params::Tuple{ForceFreeStatesControl,Equilibrium.PlasmaEquilibrium,
+        FourFitVars,ForceFreeStatesInternal,OdeState,IntegrationChunk},
+    psieval::Float64
+)
+
+    _, equil, ffit, intr, odet, _ = params
+
+    Npert = intr.numpert_total
+    S  = @view u[:, :, 1]
+    dS = @view du[:, :, 1]
+    @view(du[:, :, 2]) .= 0  # identity does not evolve
+
+    # Compute singfac = 1/(m - n·q) as column vector Q = diag(singfac_vec)
+    # [Glasser 2016 eq. 24]
+    singfac_vec = acquire!(pool, Float64, Npert)
+    singfac_mat = reshape(singfac_vec, intr.mpert, intr.npert)
+    odet.q = equil.profiles.q_spline(psieval; hint=odet.spline_hint)
+    singfac_mat .= 1.0 ./ ((intr.mlow:intr.mhigh) .- odet.q .* (intr.nlow:intr.nhigh)')
+
+    # Allocate temporaries from pool
+    fmat_lower = acquire!(pool, ComplexF64, Npert, Npert)
+    kmat = similar!(pool, fmat_lower)
+    gmat = similar!(pool, fmat_lower)
+    w    = similar!(pool, fmat_lower)  # w = Q - K̄·S
+    v    = similar!(pool, fmat_lower)  # v = F̄⁻¹·w (then reused for S·Ḡ·S)
+    tmp  = similar!(pool, fmat_lower)  # scratch
+
+    # Evaluate F̄ (Cholesky factor), K̄, Ḡ splines at current ψ
+    ffit.fmats_lower(vec(fmat_lower), psieval; hint=ffit._hint)
+    ffit.kmats(vec(kmat), psieval; hint=ffit._hint)
+    ffit.gmats(vec(gmat), psieval; hint=ffit._hint)
+
+    # w = Q - K̄·S:  w[i,j] = singfac_vec[i]·δ_ij - (K̄·S)[i,j]
+    # Q is DIAGONAL (singfac_vec[i] only on i==j), so we cannot broadcast singfac_vec
+    # over all columns — that would give the wrong off-diagonal values.
+    mul!(w, kmat, S)      # w = K̄·S
+    @. w = -w             # w = -K̄·S
+    for i in 1:Npert
+        @inbounds w[i, i] += singfac_vec[i]  # add diagonal Q: w = Q - K̄·S
+    end
+
+    # v = F̄⁻¹·w  (in-place Cholesky solve with stored lower-triangular factor)
+    v .= w
+    ldiv!(LowerTriangular(fmat_lower), v)
+    ldiv!(UpperTriangular(fmat_lower'), v)
+
+    # dS = w†·v - S·Ḡ·S  [Glasser 2018 eq. 19, dual Riccati]
+    mul!(dS, adjoint(w), v)   # dS = w†·v
+
+    # Store du1/dψ = Q·v for ud diagnostic before v is reused
+    # Q·v = diag(singfac_vec)·v = Ξ'_Ψ (displacement gradient, with U₂ = I)
+    @. odet.ud[:, :, 1] = singfac_vec * v
+    @view(odet.ud[:, :, 2]) .= 0
+
+    # Subtract S·Ḡ·S (reuse v and tmp to avoid extra allocation)
+    mul!(tmp, gmat, S)        # tmp = Ḡ·S
+    mul!(v, S, tmp)           # v   = S·Ḡ·S
+    dS .-= v
+end
+
+"""
+    riccati_integrator_callback!(integrator)
+
+Callback function for the Riccati ODE integrator. Handles tolerance updates,
+renormalization, and storage at each step.
+
+Uses `sing_der!` as the ODE RHS: u[:,:,1] = U₁ (starts as S), u[:,:,2] = U₂ (starts as I).
+When max(|U₁|) or max(|U₂|) exceeds `ctrl.ucrit`, applies `renormalize_riccati_inplace!`
+to compute S = U₁·U₂⁻¹ and reset U₂ = I. This is the Riccati analogue of Gaussian
+reduction in the standard `integrator_callback!`, and keeps the ODE inputs bounded.
+"""
+function riccati_integrator_callback!(integrator)
+
+    ctrl, _, _, intr, odet, chunk = integrator.p
+
+    # Update integration tolerances (same logic as integrator_callback!)
+    integrator.opts.reltol = compute_tols(ctrl, intr, odet, chunk.ising)
+
+    # Renormalize when norms exceed ucrit (analogous to Gaussian reduction in integrator_callback!)
+    # During sing_der! integration: u[:,:,1]=U₁ (grows), u[:,:,2]=U₂ (grows).
+    # Renorm computes S = U₁·U₂⁻¹ and resets U₂ = I, keeping inputs bounded.
+    if maximum(abs, @view(integrator.u[:, :, 1])) > ctrl.ucrit ||
+       maximum(abs, @view(integrator.u[:, :, 2])) > ctrl.ucrit
+        renormalize_riccati_inplace!(integrator.u, intr.numpert_total)
+    end
+
+    # Determine if we should save this step
+    psi_range = abs(integrator.sol.prob.tspan[2] - integrator.sol.prob.tspan[1])
+    psi_remaining = abs(integrator.sol.prob.tspan[2] - integrator.t)
+    near_end = psi_remaining < 0.05 * psi_range || psi_remaining < 1e-4
+    steps_in_segment = length(integrator.sol.t)
+    near_start = steps_in_segment <= 2
+    should_save = near_start || near_end || (odet.step % ctrl.save_interval == 0)
+
+    if should_save
+        if odet.step >= size(odet.u_store, 4)
+            resize_storage!(odet)
+        end
+        odet.psi_store[odet.step] = integrator.t
+        @views odet.u_store[:, :, :, odet.step] .= integrator.u
+        odet.q_store[odet.step] = odet.q
+        @views odet.ud_store[:, :, :, odet.step] .= odet.ud
+        odet.step += 1
+    end
+end
+
+"""
+    riccati_integrate_chunk!(odet, ctrl, equil, ffit, intr, chunk)
+
+Integrate the dual Riccati ODE from `chunk.psi_start` to `chunk.psi_end`.
+
+Uses `sing_der!` as the ODE RHS with `riccati_integrator_callback!`, which applies
+`renormalize_riccati_inplace!` (instead of Gaussian reduction) when norms exceed ucrit.
+Starting state: u[:,:,1] = S_prev, u[:,:,2] = I (set by initialization or previous renorm).
+Ending state: u[:,:,1] = U₁, u[:,:,2] = U₂ (ratio S = U₁·U₂⁻¹ is the updated Riccati matrix).
+"""
+function riccati_integrate_chunk!(
+    odet::OdeState, ctrl::ForceFreeStatesControl, equil::Equilibrium.PlasmaEquilibrium,
+    ffit::FourFitVars, intr::ForceFreeStatesInternal, chunk::IntegrationChunk
+)
+    cb = DiscreteCallback((u, t, integrator) -> true, riccati_integrator_callback!)
+    rtol = compute_tols(ctrl, intr, odet, chunk.ising)
+    prob = ODEProblem(sing_der!, odet.u, (chunk.psi_start, chunk.psi_end),
+                      (ctrl, equil, ffit, intr, odet, chunk))
+    sol = solve(prob, BS5(); reltol=rtol, callback=cb, save_everystep=false, save_end=true)
+    odet.u .= sol.u[end]
+    odet.psifac = sol.t[end]
+    # Renormalize end state to (S, I) convention for the next chunk.
+    # When a crossing follows (needs_crossing=true), skip renorm so that ca_l is computed
+    # from the bounded (U₁, U₂) state in riccati_cross_ideal_singular_surf!: this gives
+    # consistent normalization with ca_r (also from pre-renorm state), enabling correct Δ'.
+    # The callback guarantees max(|U₁|), max(|U₂|) ≤ ucrit, so the state is bounded.
+    if !chunk.needs_crossing
+        renormalize_riccati_inplace!(odet.u, intr.numpert_total)
+    end
+end
+
+"""
+    renormalize_riccati!(odet, intr)
+
+After a singular surface crossing, restore the canonical Riccati storage convention:
+  u[:,:,1] = S_new = U₁_new · U₂_new⁻¹
+  u[:,:,2] = I
+
+`riccati_cross_ideal_singular_surf!` leaves u[:,:,1] = U₁_new and u[:,:,2] = U₂_new (not I),
+so this step is required before continuing the Riccati integration.
+
+The u_store entry from the crossing correctly has U₁_new and U₂_new (stored before this call),
+so `compute_smallest_eigenvalue` still computes U₁_new/U₂_new = S_new correctly.
+"""
+function renormalize_riccati!(odet::OdeState, intr::ForceFreeStatesInternal)
+    N = intr.numpert_total
+    # S_new = U₁_new · U₂_new⁻¹  (in-place to avoid allocation)
+    U2_copy = copy(@view odet.u[:, :, 2])
+    rdiv!(@view(odet.u[:, :, 1]), lu!(U2_copy))
+    # Reset U₂ = I
+    fill!(@view(odet.u[:, :, 2]), 0)
+    for i in 1:N
+        odet.u[i, i, 2] = 1
+    end
+end
+
+"""
+    renormalize_riccati_inplace!(u, N)
+
+In-place Riccati renormalization on an arbitrary N×N×2 array:
+  u[:,:,1] = U₁ · U₂⁻¹  (new S)
+  u[:,:,2] = I
+
+Used in `riccati_integrator_callback!` to renormalize the integrator's live state
+when column norms grow beyond `ctrl.ucrit`, analogous to Gaussian reduction in the
+standard ODE. This keeps the inputs to `sing_der!` bounded, preventing the same
+exponential growth that occurs in the standard (non-Riccati) ODE without Gaussian reduction.
+"""
+function renormalize_riccati_inplace!(u::Array{ComplexF64,3}, N::Int)
+    U2_copy = copy(@view u[:, :, 2])
+    rdiv!(@view(u[:, :, 1]), lu!(U2_copy))
+    fill!(@view(u[:, :, 2]), 0)
+    for i in 1:N
+        u[i, i, 2] = 1
+    end
+end
+
+"""
+    riccati_cross_ideal_singular_surf!(odet, ctrl, equil, ffit, intr, ising)
+
+Cross a singular surface for the Riccati formulation. Replaces `cross_ideal_singular_surf!`
+for the Riccati integration path with two key differences:
+
+1. **No Gaussian reduction**: `cross_ideal_singular_surf!` calls `compute_solution_norms!`
+   which applies Gaussian reduction to (S, I). This divides by pivot elements of S, which
+   can be near-zero (S = 0 at axis and grows slowly), producing NaN/Inf in U₂. For Riccati,
+   S is bounded so Gaussian reduction is unnecessary.
+
+2. **Direct column zeroing**: Instead of using the GR-sorted `odet.index` to identify the
+   column to zero, we use `ipert_res` directly (the resonant mode index). This is valid since
+   without GR there is no permutation applied to the columns of S.
+
+**Δ' normalization**: This function expects `odet.u` in the bounded (U₁, U₂) form produced by
+`riccati_integrate_chunk!` with `needs_crossing=true` (final renorm skipped). ca_l is computed
+from (U₁, U₂) before the crossing, and ca_r from (U₁_new, U₂_new) before `renormalize_riccati!`.
+Since column `ipert_res` of [U₁_new; U₂_new] equals the introduced asymptotic solution exactly,
+ca_r[ipert_res,ipert_res,2] = 1 regardless of other column normalizations. This gives a
+physically meaningful Δ' = ca_r - ca_l with consistent left/right normalization.
+
+After the predictor step and asymptotic introduction, `renormalize_riccati!` is called
+to restore the canonical (S_new, I) form before continuing integration.
+
+The u_store entry at the crossing step correctly stores (U₁_new, U₂_new) so that
+`evaluate_stability_criterion!` can compute U₁_new / U₂_new = S_new correctly.
+"""
+function riccati_cross_ideal_singular_surf!(
+    odet::OdeState, ctrl::ForceFreeStatesControl, equil::Equilibrium.PlasmaEquilibrium,
+    ffit::FourFitVars, intr::ForceFreeStatesInternal, ising::Int
+)
+    # Skip Gaussian reduction — S is bounded so no large-norm columns exist
+
+    singp = intr.sing[ising]
+    sing_asymp = compute_sing_asymptotics(singp, ctrl, equil, ffit, intr)
+    dpsi = singp.psifac - odet.psifac  # ψ_res - ψ_current (positive)
+
+    # Get asymptotic coefficients before crossing
+    ua = sing_get_ua(sing_asymp, -dpsi)
+    odet.ca_l[:, :, :, ising] .= sing_get_ca(odet.u, ua, intr)
+
+    # Resonant perturbation indices (same formula as in cross_ideal_singular_surf!)
+    ipert_res = 1 .+ singp.m .- intr.mlow .+ (singp.n .- intr.nlow) .* intr.mpert
+
+    if !ctrl.con_flag
+        # Zero the resonant column of (S, I) using ipert_res directly (no GR sorting needed).
+        # The zeroed column stays zero through the predictor step since both slices are zero.
+        for i in eachindex(sing_asymp.r1)
+            odet.u[:, ipert_res[i], :] .= 0
+        end
+    end
+
+    # Predictor: approximate solution on the other side of the singular surface.
+    # sing_der! works on any (U1, U2) state — the zeroed column remains zero since
+    # du1[:, ipert_res] = 0 and du2[:, ipert_res] = 0 when u[:, ipert_res, :] = 0.
+    params = (ctrl, equil, ffit, intr, odet, IntegrationChunk(0.0, 0.0, false, ising, 1))
+    du1 = zeros(ComplexF64, intr.numpert_total, intr.numpert_total, 2)
+    du2 = zeros(ComplexF64, intr.numpert_total, intr.numpert_total, 2)
+    sing_der!(du1, odet.u, params, odet.psifac)
+    odet.psifac += 2 * dpsi  # jump to other side of singular surface
+    sing_der!(du2, odet.u, params, odet.psifac)
+    odet.u .+= (du1 .+ du2) .* dpsi
+
+    # Apply asymptotic solution on other side of singular surface
+    ua = sing_get_ua(sing_asymp, dpsi)
+    if !ctrl.con_flag
+        for i in eachindex(sing_asymp.r1)
+            # Zero the resonant row (removes large components at the resonant mode)
+            odet.u[ipert_res[i], :, :] .= 0
+            # Introduce the small asymptotic resonant solution in the zeroed column.
+            # ua[:, ipert_res[i]+numpert_total, :] is the "lower" (small) solution for mode ipert_res[i].
+            # After this, u[:,:,2] = U₂_new ≠ I (has asymptotic in column ipert_res[i]);
+            # renormalize_riccati! will compute S_new = U₁_new · U₂_new⁻¹ and reset U₂ = I.
+            odet.u[:, ipert_res[i], :] .= ua[:, ipert_res[i]+intr.numpert_total, :]
+        end
+    end
+    # Compute ca_r from (U₁_new, U₂_new) before renormalization.
+    # Column ipert_res of [U₁_new; U₂_new] = ua[:,ipert_res+N,:] (the introduced small asymptotic),
+    # so ca_r[:,ipert_res] = e_{ipert_res+N} and ca_r[ipert_res,ipert_res,2] = 1 regardless of
+    # the normalization of the other columns. This gives Δ' = 1 - ca_l[ipert_res,ipert_res,2].
+    odet.ca_r[:, :, :, ising] .= sing_get_ca(odet.u, ua, intr)
+
+    # Compute Δ' using ipert_res directly (no GR → perm_col = ipert_res, ca_r diagonal = 1).
+    # Also compute the full column Δ' (all N modes) for the off-diagonal coupling.
+    if !ctrl.con_flag
+        denom = (2π)^2 * equil.psio
+        n_res = length(sing_asymp.r1)
+        N = intr.numpert_total
+        resize!(intr.sing[ising].delta_prime, n_res)
+        intr.sing[ising].delta_prime_col = zeros(ComplexF64, N, n_res)
+        for i in eachindex(sing_asymp.r1)
+            Δca_col = (odet.ca_r[:, ipert_res[i], 2, ising] - odet.ca_l[:, ipert_res[i], 2, ising]) / denom
+            intr.sing[ising].delta_prime_col[:, i] .= Δca_col
+            intr.sing[ising].delta_prime[i] = Δca_col[ipert_res[i]]
+        end
+    end
+
+    # Store (U₁_new, U₂_new) before renormalization so evaluate_stability_criterion!
+    # can recover S_new = U₁_new / U₂_new correctly via compute_smallest_eigenvalue
+    odet.psi_store[odet.step] = odet.psifac
+    odet.q_store[odet.step] = odet.q
+    odet.u_store[:, :, :, odet.step] = odet.u
+    odet.ud_store[:, :, :, odet.step] = odet.ud
+    odet.step += 1
+
+    # Renormalize to Riccati convention: S_new = U₁_new · U₂_new⁻¹, reset U₂ = I
+    renormalize_riccati!(odet, intr)
+end
+
+"""
+    riccati_eulerlagrange_integration(ctrl, equil, ffit, intr) -> OdeState
+
+Main driver for integrating the dual Riccati ODE across the plasma.
+Functionally identical to `eulerlagrange_integration` except:
+
+1. Uses `riccati_integrate_chunk!`: drives `sing_der!` with `riccati_integrator_callback!`
+   which applies `renormalize_riccati_inplace!` (instead of Gaussian reduction) when
+   column norms exceed ucrit
+2. Uses `riccati_cross_ideal_singular_surf!` instead of `cross_ideal_singular_surf!`:
+   skips Gaussian reduction (avoids near-zero pivot issues when S is small near axis)
+   and renormalizes to (S_new, I) in one step
+3. Skips `transform_u!` — S is already the true solution, no Gaussian-reduction undo needed
+
+Enable via `use_riccati = true` in `[ForceFreeStates]` section of jpec.toml, or by
+setting `ctrl.use_riccati = true` programmatically.
+"""
+function riccati_eulerlagrange_integration(
+    ctrl::ForceFreeStatesControl, equil::Equilibrium.PlasmaEquilibrium,
+    ffit::FourFitVars, intr::ForceFreeStatesInternal
+)
+    # Initialization — same as eulerlagrange_integration
+    odet = OdeState(intr.numpert_total, ctrl.numsteps_init, ctrl.numunorms_init, intr.msing)
+    if ctrl.sing_start <= 0
+        initialize_el_at_axis!(odet, ctrl, equil.profiles, intr)
+        # axis init sets u[:,:,1]=0, u[:,:,2]=I → S=0 at axis ✓
+    elseif ctrl.sing_start <= intr.msing
+        error("sing_start > 0 not implemented yet!")
+    else
+        error("Invalid value for sing_start: $(ctrl.sing_start) > msing = $(intr.msing)")
+    end
+
+    chunks = chunk_el_integration_bounds(odet, ctrl, intr)
+
+    # Prime odet.new = false so that compute_solution_norms! (if called elsewhere)
+    # does not skip Gaussian reduction on first invocation. Also initialize unorm0
+    # to safe defaults since the Riccati callback never calls compute_solution_norms!.
+    odet.new = false
+    fill!(odet.unorm0, 1.0)
+
+    if ctrl.verbose
+        @info "   ψ = $((@sprintf "%.3f" odet.psifac)),  q = $((@sprintf "%.3f" equil.profiles.q_spline(odet.psifac)))"
+    end
+
+    for chunk in chunks
+        # Integrate this chunk using the Riccati ODE (Riccati callback skips Gaussian reduction)
+        riccati_integrate_chunk!(odet, ctrl, equil, ffit, intr, chunk)
+        if ctrl.verbose
+            @info "   ψ = $((@sprintf "%.3f" odet.psifac)),  q= $((@sprintf "%.3f" odet.q)),  max(S) = $((@sprintf "%.2e" maximum(abs, odet.u[:,:,1]))),  steps = $(odet.step-1)"
+        end
+
+        # Cross rational surface (Riccati crossing skips GR, uses ipert_res directly)
+        if chunk.needs_crossing
+            if ctrl.kin_flag
+                error("kin_flag = true not implemented yet!")
+            else
+                riccati_cross_ideal_singular_surf!(odet, ctrl, equil, ffit, intr, chunk.ising)
+                # renormalize_riccati! is called inside riccati_cross_ideal_singular_surf!
+            end
+        end
+    end
+
+    # Find peak dW in edge region if applicable (uses free_compute_total which reads wp = I/S = P)
+    if ctrl.psiedge < intr.psilim
+        odet.step = findmax_dW_edge!(odet, ctrl, equil, ffit, intr)
+        trim_storage!(odet)
+        if ctrl.verbose
+            @info "Truncating integration at peak edge dW: ψ = $((@sprintf "%.2f" odet.psi_store[odet.step])),  q = $((@sprintf "%.2f" odet.q_store[odet.step]))"
+        end
+        intr.psilim = odet.psi_store[end]
+        intr.qlim = odet.q_store[end]
+        odet.u .= odet.u_store[:, :, :, end]
+    else
+        odet.step -= 1
+        trim_storage!(odet)
+    end
+
+    # Evaluate fixed-boundary stability criterion
+    if ctrl.verbose
+        @info "Evaluating fixed-boundary stability criterion"
+    end
+    odet.nzero = evaluate_stability_criterion!(odet, equil.profiles)
+
+    # Note: transform_u! is intentionally skipped.
+    # S is already the true solution (invariant under Gaussian reduction),
+    # and u_store entries have u[:,:,1]=S, u[:,:,2]=I throughout integration.
+    # At crossing steps, u_store has U₁_new/U₂_new which compute_smallest_eigenvalue
+    # correctly resolves to S_new via rdiv. No transformation is needed.
+
+    return odet
+end
+
+"""
+    integrate_propagator_chunk!(prop, chunk, ctrl, equil, ffit, intr, odet_proxy)
+
+Compute the fundamental matrix (propagator) for one integration chunk by solving the
+EL ODE twice from identity-block initial conditions.
+
+The first solve uses IC = (I_N, 0_N) (U₁=I, U₂=0) and stores the result in
+`prop.block_upper_ic`. The second uses IC = (0_N, I_N) (U₁=0, U₂=I) and stores
+the result in `prop.block_lower_ic`.
+
+`odet_proxy` is a per-thread lightweight `OdeState` used to provide thread-local
+storage for `sing_der!` side effects (`q`, `ud`, `spline_hint`). Multiple threads
+may call this function concurrently using distinct `odet_proxy` objects.
+
+No callback is used: the propagator integration proceeds without normalization or
+storage steps, since the identity ICs ensure bounded solutions within each chunk.
+"""
+function integrate_propagator_chunk!(
+    prop::ChunkPropagator,
+    chunk::IntegrationChunk,
+    ctrl::ForceFreeStatesControl,
+    equil::Equilibrium.PlasmaEquilibrium,
+    ffit::FourFitVars,
+    intr::ForceFreeStatesInternal,
+    odet_proxy::OdeState
+)
+    N = intr.numpert_total
+    # Reverse tspan for backward chunks (direction=-1): OrdinaryDiffEq handles negative tspan
+    # naturally. The resulting propagator maps state at psi_end → psi_start, which is
+    # well-conditioned because exponentially growing solutions (forward) decay backward.
+    tspan = chunk.direction == 1 ?
+        (chunk.psi_start, chunk.psi_end) :
+        (chunk.psi_end,   chunk.psi_start)
+    rtol = chunk.ising > 0 ? ctrl.tol_r : ctrl.tol_nr
+    params = (ctrl, equil, ffit, intr, odet_proxy, chunk)
+
+    # Upper block IC: U₁ = I, U₂ = 0
+    u_upper = zeros(ComplexF64, N, N, 2)
+    for i in 1:N
+        u_upper[i, i, 1] = 1
+    end
+    odet_proxy.spline_hint[] = 1
+    prob = ODEProblem(sing_der!, u_upper, tspan, params)
+    sol = solve(prob, BS5(); reltol=rtol, save_everystep=false, save_end=true)
+    prop.block_upper_ic .= sol.u[end]
+
+    # Lower block IC: U₁ = 0, U₂ = I
+    u_lower = zeros(ComplexF64, N, N, 2)
+    for i in 1:N
+        u_lower[i, i, 2] = 1
+    end
+    odet_proxy.spline_hint[] = 1
+    prob = ODEProblem(sing_der!, u_lower, tspan, params)
+    sol = solve(prob, BS5(); reltol=rtol, save_everystep=false, save_end=true)
+    prop.block_lower_ic .= sol.u[end]
+end
+
+"""
+    apply_propagator!(odet, prop)
+
+Apply the chunk propagator `prop` to the current state `odet.u` in-place.
+
+The propagator acts as a linear map on the (U₁, U₂) pair:
+
+  U₁_new = block_upper_ic[:,:,1] · U₁_prev + block_lower_ic[:,:,1] · U₂_prev
+  U₂_new = block_upper_ic[:,:,2] · U₁_prev + block_lower_ic[:,:,2] · U₂_prev
+
+This correctly propagates any state (not just the identity), including the
+(S, I) form produced by Riccati-style crossings.
+"""
+function apply_propagator!(odet::OdeState, prop::ChunkPropagator)
+    U1_upper = @view prop.block_upper_ic[:, :, 1]
+    U2_upper = @view prop.block_upper_ic[:, :, 2]
+    U1_lower = @view prop.block_lower_ic[:, :, 1]
+    U2_lower = @view prop.block_lower_ic[:, :, 2]
+
+    u1_prev = copy(@view odet.u[:, :, 1])
+    u2_prev = copy(@view odet.u[:, :, 2])
+    tmp = similar(u1_prev)
+
+    # U₁_new = U1_upper · u1_prev + U1_lower · u2_prev
+    mul!(view(odet.u, :, :, 1), U1_upper, u1_prev)
+    mul!(tmp, U1_lower, u2_prev)
+    odet.u[:, :, 1] .+= tmp
+
+    # U₂_new = U2_upper · u1_prev + U2_lower · u2_prev
+    mul!(view(odet.u, :, :, 2), U2_upper, u1_prev)
+    mul!(tmp, U2_lower, u2_prev)
+    odet.u[:, :, 2] .+= tmp
+end
+
+"""
+    apply_propagator_inverse!(odet, prop)
+
+Apply the *inverse* of the chunk propagator `prop` to the current state `odet.u` in-place.
+
+Used for backward chunks (direction=-1): the stored propagator Φ_bwd maps state at
+`psi_end` → state at `psi_start` (well-conditioned because solutions that grow
+exponentially forward decay backward). To advance the Riccati state from `psi_start`
+to `psi_end`, we solve Φ_bwd · x = u_old, which gives x = Φ_bwd⁻¹ · u_old = Φ_fwd · u_old.
+
+Since Φ_bwd is well-conditioned, the LU solve is accurate, giving the same result as
+applying the (ill-conditioned) forward propagator Φ_fwd but with far better precision.
+"""
+function apply_propagator_inverse!(odet::OdeState, prop::ChunkPropagator)
+    N = size(odet.u, 1)
+    # Assemble 2N×2N backward FM Φ_bwd
+    Φ = [prop.block_upper_ic[:,:,1] prop.block_lower_ic[:,:,1];
+         prop.block_upper_ic[:,:,2] prop.block_lower_ic[:,:,2]]
+    # Φ_bwd maps state at psi_end → psi_start (well-conditioned).
+    # We want Φ_fwd = Φ_bwd⁻¹ to advance state from psi_start → psi_end.
+    # Solving Φ_bwd · x = [U₁_old; U₂_old] gives x = Φ_bwd⁻¹ · [U₁_old; U₂_old].
+    u_old = [odet.u[:,:,1]; odet.u[:,:,2]]   # 2N × N
+    u_new = Φ \ u_old                         # LU solve, 2N × N
+    odet.u[:,:,1] .= u_new[1:N, :]
+    odet.u[:,:,2] .= u_new[N+1:2N, :]
+end
+
+"""
+    parallel_eulerlagrange_integration(ctrl, equil, ffit, intr) -> OdeState
+
+Parallel fundamental matrix (propagator) driver for the EL integration.
+
+Functionally equivalent to `eulerlagrange_integration`, integrating all bulk chunks
+concurrently using `Threads.@threads`, then re-integrating the outer plasma serially:
+
+1. **Chunk generation**: calls `chunk_el_integration_bounds`, then `balance_integration_chunks`
+   to sub-divide chunks for load-balanced parallel execution.
+2. **Parallel phase**: `integrate_propagator_chunk!` integrates each chunk independently
+   from identity initial conditions (no accumulated state, no normalization/callback).
+   Each thread uses a private `OdeState` proxy for `sing_der!` side effects.
+3. **Serial assembly**: propagators are applied sequentially with `apply_propagator!`.
+   Rational surface crossings use `riccati_cross_ideal_singular_surf!` (no Gaussian
+   reduction) matching the Riccati path convention.
+4. **Outer plasma re-integration**: after the last rational surface crossing, the outer
+   plasma (from last ψ_s to psilim) is re-integrated using `riccati_integrate_chunk!`.
+   FM propagation in this region is prone to precision loss for high N (exponential growth
+   without renormalization); Riccati integration keeps matrices bounded and provides dense
+   checkpoints for `findmax_dW_edge!`.
+
+Enable via `use_parallel = true` in `[ForceFreeStates]` of jpec.toml, or by setting
+`ctrl.use_parallel = true` programmatically. Requires `singfac_min != 0`.
+
+**Key differences from standard integration:**
+- No Gaussian reduction (crossings use riccati-style, odet.ifix stays 0)
+- `transform_u!` is called but is a no-op (identity transform, ifix=0)
+- `ud_store` is approximate (set to zeros for FM chunks; does not affect energies or Δ')
+- Outer plasma uses serial Riccati integration for numerical stability
+
+**Bidirectional integration for large-N accuracy:**
+The crossing chunk (nearest to each rational surface singL[j]) is integrated *backward*
+(`direction=-1`, `tspan` reversed). Backward integration of a region where solutions grow
+exponentially forward causes them to *decay*, so the resulting backward FM Φ_bwd is
+well-conditioned. The accurate forward propagation is recovered as Φ_bwd⁻¹ via a stable
+LU solve in `apply_propagator_inverse!`. This follows the same principle as STRIDE
+(Glasser 2018 Phys. Plasmas 25, 032501). The all-forward path had ~10% energy error for
+the DIIID-like example (N=26, n=1); bidirectional reduces this to within 2%.
+"""
+function parallel_eulerlagrange_integration(
+    ctrl::ForceFreeStatesControl, equil::Equilibrium.PlasmaEquilibrium,
+    ffit::FourFitVars, intr::ForceFreeStatesInternal
+)
+    # Initialization — same as eulerlagrange_integration
+    odet = OdeState(intr.numpert_total, ctrl.numsteps_init, ctrl.numunorms_init, intr.msing)
+    if ctrl.sing_start <= 0
+        initialize_el_at_axis!(odet, ctrl, equil.profiles, intr)
+    elseif ctrl.sing_start <= intr.msing
+        error("sing_start > 0 not implemented yet!")
+    else
+        error("Invalid value for sing_start: $(ctrl.sing_start) > msing = $(intr.msing)")
+    end
+
+    # Prime odet.new = false (consistent with riccati path — no Gaussian reduction used)
+    odet.new = false
+    fill!(odet.unorm0, 1.0)
+
+    # Build chunks and sub-divide for load-balanced parallel execution.
+    # bidirectional=true: crossing chunks (nearest to each rational surface) are assigned
+    # direction=-1, so they are integrated backward. The resulting backward propagator
+    # Φ_bwd is well-conditioned because growing EL solutions decay backward. The forward
+    # propagation is recovered as Φ_bwd⁻¹ via LU solve in apply_propagator_inverse!.
+    base_chunks = chunk_el_integration_bounds(odet, ctrl, intr; bidirectional=true)
+    chunks = balance_integration_chunks(base_chunks, ctrl, intr)
+
+    N = intr.numpert_total
+    propagators = [ChunkPropagator(N) for _ in chunks]
+
+    # Per-thread lightweight proxy OdeState for sing_der! side effects
+    nthreads = Threads.nthreads()
+    odet_proxies = [OdeState(N, 1, 1, 0) for _ in 1:nthreads]
+
+    if ctrl.verbose
+        @info "   ψ = $((@sprintf "%.3f" odet.psifac)),  q = $((@sprintf "%.3f" equil.profiles.q_spline(odet.psifac)))"
+        @info "   Parallel FM: $(length(chunks)) chunks, $nthreads threads"
+    end
+
+    # PARALLEL phase: integrate all chunks independently from identity IC.
+    # :static scheduler pins each task to one OS thread for its lifetime, so
+    # Threads.threadid() returns a stable index into odet_proxies.
+    # Without :static, Julia's task scheduler can migrate tasks between threads,
+    # making threadid() unreliable (Julia 1.7+).
+    Threads.@threads :static for i in eachindex(chunks)
+        integrate_propagator_chunk!(propagators[i], chunks[i], ctrl, equil, ffit, intr,
+                                    odet_proxies[Threads.threadid()])
+    end
+
+    # SERIAL assembly: apply propagators and handle crossings in order.
+    # After each apply_propagator!, renormalize to (S, I) form. This is the Julia
+    # equivalent of STRIDE's ode_fixup: it prevents exponential growth of the
+    # accumulated state between crossings. Without this renorm, products of N chunk
+    # FMs can have condition numbers up to (cond_per_chunk)^N, causing catastrophic
+    # cancellation for large N (N ≳ 20). With renorm, each chunk is applied as a
+    # Möbius transformation on the bounded S matrix, keeping errors at O(eps × cond_chunk)
+    # rather than O(eps × cond_chunk^N). [STRIDE ode.F: ode_fixup called after each uAxis step]
+    #
+    # last_crossing_step tracks the u_store index of the most recent crossing so that
+    # the outer plasma (from last rational surface to psilim) can be re-integrated.
+    last_crossing_step = 1
+    for (i, chunk) in enumerate(chunks)
+        # Forward chunks: apply propagator directly (Φ_fwd maps psi_start → psi_end).
+        # Backward chunks (crossing chunks with direction=-1): apply inverse of the
+        # backward propagator. Φ_bwd maps psi_end → psi_start and is well-conditioned;
+        # its inverse Φ_fwd = Φ_bwd⁻¹ gives accurate forward propagation via LU solve.
+        if chunk.direction == -1
+            apply_propagator_inverse!(odet, propagators[i])
+        else
+            apply_propagator!(odet, propagators[i])
+        end
+        # Renorm to (S, I) after every chunk — equivalent to STRIDE's ode_fixup.
+        # The state entering each crossing is already in (S, I) form.
+        renormalize_riccati_inplace!(odet.u, N)
+        odet.psifac = chunk.psi_end
+        odet.q = equil.profiles.q_spline(odet.psifac)
+
+        if ctrl.verbose
+            @info "   ψ = $((@sprintf "%.3f" odet.psifac)),  q= $((@sprintf "%.3f" odet.q)),  max(S) = $((@sprintf "%.2e" maximum(abs, odet.u[:,:,1]))),  steps = $(odet.step-1)"
+        end
+
+        if chunk.needs_crossing
+            if ctrl.kin_flag
+                error("kin_flag = true not implemented yet!")
+            else
+                # State is already (S, I) from the renorm above.
+                # riccati_cross_ideal_singular_surf! zeros column ipert_res directly
+                # (the resonant mode, no GR permutation needed in Riccati form).
+                riccati_cross_ideal_singular_surf!(odet, ctrl, equil, ffit, intr, chunk.ising)
+                last_crossing_step = odet.step - 1  # u_store index of the crossing state
+            end
+        else
+            # Save non-crossing end-of-chunk state (now always in (S, I) form)
+            if odet.step >= size(odet.u_store, 4)
+                resize_storage!(odet)
+            end
+            odet.psi_store[odet.step] = odet.psifac
+            odet.q_store[odet.step] = odet.q
+            @views odet.u_store[:, :, :, odet.step] .= odet.u
+            # ud not available from propagator integration — left as zeros
+            odet.step += 1
+        end
+    end
+
+    # Re-integrate the outer plasma (from last rational surface crossing to psilim) using
+    # Riccati for numerical stability and dense checkpoint storage.
+    #
+    # FM propagation in the outer plasma (no rational surfaces) is prone to precision loss
+    # for high N: the solution grows exponentially without renormalization, causing matrix
+    # condition numbers to grow and wp = U₂·U₁⁻¹ to lose accuracy. Riccati integration
+    # keeps matrices bounded via periodic renormalization.
+    #
+    # Dense checkpoints from this re-integration are also required for findmax_dW_edge! to
+    # accurately locate the peak dW in the edge region (psiedge < psilim case).
+    #
+    # The u_store entry at last_crossing_step contains (U₁_new, U₂_new) stored by
+    # riccati_cross_ideal_singular_surf! before renormalization; renormalizing here gives
+    # (S_new, I) as the correct Riccati starting state for the re-integration.
+    odet.u .= odet.u_store[:, :, :, last_crossing_step]
+    odet.psifac = odet.psi_store[last_crossing_step]
+    odet.q = odet.q_store[last_crossing_step]
+    odet.step = last_crossing_step + 1
+    renormalize_riccati_inplace!(odet.u, N)
+    outer_chunk = IntegrationChunk(; psi_start=odet.psifac, psi_end=intr.psilim * (1 - eps),
+                                     needs_crossing=false, ising=0)
+    riccati_integrate_chunk!(odet, ctrl, equil, ffit, intr, outer_chunk)
+    # After riccati_integrate_chunk! with needs_crossing=false:
+    #   odet.u is in (S, I) form (renorm'd at end of integration)
+    #   odet.step points to next empty slot; dense checkpoints stored for outer region
+
+    # Find peak dW in edge region (same as standard/Riccati path)
+    if ctrl.psiedge < intr.psilim
+        odet.step = findmax_dW_edge!(odet, ctrl, equil, ffit, intr)
+        trim_storage!(odet)
+        if ctrl.verbose
+            @info "Truncating integration at peak edge dW: ψ = $((@sprintf "%.2f" odet.psi_store[odet.step])),  q = $((@sprintf "%.2f" odet.q_store[odet.step]))"
+        end
+        intr.psilim = odet.psi_store[end]
+        intr.qlim = odet.q_store[end]
+        odet.u .= odet.u_store[:, :, :, end]
+        # The stored state may be a pre-renorm callback snapshot; renorm to (S, I) for free_run!
+        renormalize_riccati_inplace!(odet.u, N)
+    else
+        odet.step -= 1
+        trim_storage!(odet)
+        # odet.u is already in (S, I) from riccati_integrate_chunk! above
+    end
+
+    # Compute inter-surface Δ' matrix using the STRIDE global BVP.
+    # Uses the chunk propagators from the parallel phase (all chunks, including outer plasma).
+    # Only called when there are singular surfaces to couple.
+    if !ctrl.con_flag && intr.msing > 0
+        compute_delta_prime_matrix!(intr, propagators, chunks)
+    end
+
+    # Evaluate fixed-boundary stability criterion
+    if ctrl.verbose
+        @info "Evaluating fixed-boundary stability criterion"
+    end
+    odet.nzero = evaluate_stability_criterion!(odet, equil.profiles)
+
+    # transform_u! is called for consistency but is a no-op (ifix=0, no Gaussian reduction)
+    transform_u!(odet, intr)
+
+    return odet
+end
diff --git a/src/GeneralizedPerturbedEquilibrium.jl b/src/GeneralizedPerturbedEquilibrium.jl
index 93c5fd05..8f16adb7 100755
--- a/src/GeneralizedPerturbedEquilibrium.jl
+++ b/src/GeneralizedPerturbedEquilibrium.jl
@@ -439,6 +439,53 @@ function write_outputs_to_HDF5(
         out_h5["singular/ca_left"] = odet.ca_l
         out_h5["singular/ca_right"] = odet.ca_r
 
+        if intr.msing > 0
+            # Mode numbers at each surface (jagged — pad with 0 to max_modes width)
+            max_modes = maximum(s -> length(s.m), intr.sing)
+            m_matrix = zeros(Int, intr.msing, max_modes)
+            n_matrix = zeros(Int, intr.msing, max_modes)
+            for (s, sing) in enumerate(intr.sing)
+                for i in 1:length(sing.m)
+                    m_matrix[s, i] = sing.m[i]
+                    n_matrix[s, i] = sing.n[i]
+                end
+            end
+            out_h5["singular/m"] = m_matrix
+            out_h5["singular/n"] = n_matrix
+        end
+
+        # Write Δ' if computed (one complex value per resonant mode per singular surface)
+        if intr.msing > 0 && all(s -> !isempty(s.delta_prime), intr.sing)
+            max_modes = maximum(s -> length(s.delta_prime), intr.sing)
+            dp_matrix = zeros(ComplexF64, intr.msing, max_modes)
+            for (s, sing) in enumerate(intr.sing)
+                for i in 1:length(sing.delta_prime)
+                    dp_matrix[s, i] = sing.delta_prime[i]
+                end
+            end
+            out_h5["singular/delta_prime"] = dp_matrix
+        end
+
+        # Write full off-diagonal Δ' column if computed (Riccati/parallel FM paths only).
+        # Shape: [numpert_total × max_modes × msing], where delta_prime_col[:, i, s] is
+        # the coupling of all N modes to resonant mode i at surface s.
+        if intr.msing > 0 && all(s -> !isempty(s.delta_prime_col), intr.sing)
+            N = size(intr.sing[1].delta_prime_col, 1)
+            max_modes = maximum(s -> size(s.delta_prime_col, 2), intr.sing)
+            dp_col_tensor = zeros(ComplexF64, N, max_modes, intr.msing)
+            for (s, sing) in enumerate(intr.sing)
+                n_res = size(sing.delta_prime_col, 2)
+                dp_col_tensor[:, 1:n_res, s] = sing.delta_prime_col
+            end
+            out_h5["singular/delta_prime_col"] = dp_col_tensor
+        end
+
+        # Write inter-surface Δ' matrix if computed (parallel FM path only).
+        # Shape: [2·msing × 2·msing] where rows/columns index (surface, side) pairs.
+        if intr.msing > 0 && !isempty(intr.delta_prime_matrix)
+            out_h5["singular/delta_prime_matrix"] = intr.delta_prime_matrix
+        end
+
         # Write vacuum data; always write all entries, using empty arrays when not computed
         out_h5["vacuum/wt"] = ctrl.vac_flag ? vac_data.wt : ComplexF64[]
         out_h5["vacuum/wt0"] = ctrl.vac_flag ? vac_data.wt0 : ComplexF64[]
diff --git a/test/runtests.jl b/test/runtests.jl
index 2efa4098..06d4daf7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -24,6 +24,8 @@ else
     include("./runtests_vacuum.jl")
     include("./runtests_equil.jl")
     include("./runtests_eulerlagrange.jl")
+    include("./runtests_riccati.jl")
+    include("./runtests_parallel_integration.jl")
     include("./runtests_sing.jl")
     include("./runtests_fullruns.jl")
 end
diff --git a/test/runtests_parallel_integration.jl b/test/runtests_parallel_integration.jl
new file mode 100644
index 00000000..4a85d76c
--- /dev/null
+++ b/test/runtests_parallel_integration.jl
@@ -0,0 +1,446 @@
+using LinearAlgebra
+using TOML
+
+@testset "Parallel FM Integration Tests" begin
+
+    @testset "ChunkPropagator identity on trivial interval" begin
+        # Integrating over a zero-width interval should give the identity propagator.
+        # We test that apply_propagator! on an identity state preserves the state.
+        N = 3
+        prop = GeneralizedPerturbedEquilibrium.ForceFreeStates.ChunkPropagator(N)
+
+        # Set propagator to identity (block_upper_ic = (I, 0), block_lower_ic = (0, I))
+        for i in 1:N
+            prop.block_upper_ic[i, i, 1] = 1  # U1 block from IC=(I,0)
+            prop.block_lower_ic[i, i, 2] = 1  # U2 block from IC=(0,I)
+        end
+
+        # Apply identity propagator to an arbitrary state
+        odet = GeneralizedPerturbedEquilibrium.ForceFreeStates.OdeState(N, 10, 5, 0)
+        u1_in = [1.0+0.5im  0.2im   0.0;
+                 0.1+0.1im  1.2+0.1im 0.0;
+                 0.0im      0.0      0.9+0.3im]
+        u2_in = [0.8+0.1im  0.1im   0.0;
+                 0.0im      1.0+0.2im 0.1;
+                 0.1im      0.0      1.1+0.0im]
+        odet.u[:, :, 1] .= u1_in
+        odet.u[:, :, 2] .= u2_in
+
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.apply_propagator!(odet, prop)
+
+        @test odet.u[:, :, 1] ≈ u1_in  rtol=1e-12
+        @test odet.u[:, :, 2] ≈ u2_in  rtol=1e-12
+    end
+
+    @testset "apply_propagator! linearity" begin
+        # Verify that apply_propagator! applies the correct linear map.
+        N = 3
+        prop = GeneralizedPerturbedEquilibrium.ForceFreeStates.ChunkPropagator(N)
+
+        # Fill block_upper_ic and block_lower_ic with random data
+        rng_upper = [1.1+0.2im  0.1im   0.05;
+                     0.0im      0.9+0.3im 0.1;
+                     0.2+0.1im  0.0      1.0+0.1im]
+        rng_lower = [0.8+0.1im  0.1im   0.0;
+                     0.0im      1.2+0.2im 0.1;
+                     0.0im      0.1      0.9+0.1im]
+        prop.block_upper_ic[:, :, 1] .= rng_upper
+        prop.block_upper_ic[:, :, 2] .= 0.5 * rng_upper
+        prop.block_lower_ic[:, :, 1] .= 0.3 * rng_lower
+        prop.block_lower_ic[:, :, 2] .= rng_lower
+
+        odet = GeneralizedPerturbedEquilibrium.ForceFreeStates.OdeState(N, 10, 5, 0)
+        u1_in = 0.5 * I(N) .+ 0.1im * ones(N, N)
+        u2_in = I(N) .+ 0.2im * ones(N, N)
+        odet.u[:, :, 1] .= u1_in
+        odet.u[:, :, 2] .= u2_in
+
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.apply_propagator!(odet, prop)
+
+        # Manual computation of expected result
+        U1_upper = prop.block_upper_ic[:, :, 1]
+        U2_upper = prop.block_upper_ic[:, :, 2]
+        U1_lower = prop.block_lower_ic[:, :, 1]
+        U2_lower = prop.block_lower_ic[:, :, 2]
+        u1_expected = U1_upper * u1_in + U1_lower * u2_in
+        u2_expected = U2_upper * u1_in + U2_lower * u2_in
+
+        @test odet.u[:, :, 1] ≈ u1_expected  rtol=1e-12
+        @test odet.u[:, :, 2] ≈ u2_expected  rtol=1e-12
+    end
+
+    @testset "apply_propagator_inverse! is inverse of apply_propagator!" begin
+        # Verify that apply_propagator_inverse! is the algebraic inverse of apply_propagator!:
+        # applying inverse then forward should recover the original state exactly.
+        # This checks the LU-solve path: Φ \ (Φ * u) = u for an arbitrary invertible Φ.
+        N = 3
+        prop = GeneralizedPerturbedEquilibrium.ForceFreeStates.ChunkPropagator(N)
+
+        # Near-identity blocks guarantee the 2N×2N matrix [A B; C D] is invertible
+        A = I(N) .+ 0.15 * [1.0+0.2im  0.1im   0.05; 0.0im  0.9+0.3im  0.1; 0.2+0.1im  0.0  1.0+0.1im]
+        B = 0.1  * [0.8+0.1im  0.1im   0.0;    0.0im  1.2+0.2im  0.1; 0.0im  0.1  0.9+0.1im]
+        C = 0.1  * [0.5+0.1im  0.0im   0.1;    0.1im  0.8+0.2im  0.0; 0.0im  0.0  0.7+0.1im]
+        D = I(N) .+ 0.15 * [0.9+0.1im  0.0im   0.05; 0.0im  1.0+0.2im  0.0; 0.1+0.1im  0.0  0.95+0.1im]
+
+        prop.block_upper_ic[:, :, 1] .= A
+        prop.block_lower_ic[:, :, 1] .= B
+        prop.block_upper_ic[:, :, 2] .= C
+        prop.block_lower_ic[:, :, 2] .= D
+
+        u1_in = [1.0+0.5im  0.2im   0.0;
+                 0.1+0.1im  1.2+0.1im 0.0;
+                 0.0im      0.0      0.9+0.3im]
+        u2_in = I(N) .+ 0.1im * ones(N, N)
+
+        odet = GeneralizedPerturbedEquilibrium.ForceFreeStates.OdeState(N, 10, 5, 0)
+        odet.u[:, :, 1] .= u1_in
+        odet.u[:, :, 2] .= u2_in
+
+        # Round-trip: inverse then forward = identity
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.apply_propagator_inverse!(odet, prop)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.apply_propagator!(odet, prop)
+
+        @test odet.u[:, :, 1] ≈ u1_in  rtol=1e-12
+        @test odet.u[:, :, 2] ≈ u2_in  rtol=1e-12
+    end
+
+    @testset "balance_integration_chunks produces target count" begin
+        # Verify that balance_integration_chunks creates at least
+        # max(2*msing+3, 4*nthreads) chunks from a small set of base chunks.
+        ex = joinpath(@__DIR__, "test_data", "regression_solovev_ideal_example")
+        inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+        inputs["ForceFreeStates"]["verbose"] = false
+        intr = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesInternal(; dir_path=ex)
+        ctrl = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesControl(;
+            (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+        eq_config = GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex)
+        equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(eq_config)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_lim!(intr, ctrl, equil)
+        intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_find!(intr, equil)
+        intr.mlow = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+        intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+        intr.mpert = intr.mhigh - intr.mlow + 1
+        intr.mband = intr.mpert - 1
+        intr.numpert_total = intr.mpert * intr.npert
+
+        odet = GeneralizedPerturbedEquilibrium.ForceFreeStates.OdeState(intr.numpert_total, ctrl.numsteps_init, ctrl.numunorms_init, intr.msing)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.initialize_el_at_axis!(odet, ctrl, equil.profiles, intr)
+
+        base_chunks = GeneralizedPerturbedEquilibrium.ForceFreeStates.chunk_el_integration_bounds(odet, ctrl, intr)
+        balanced = GeneralizedPerturbedEquilibrium.ForceFreeStates.balance_integration_chunks(base_chunks, ctrl, intr)
+
+        target_n = max(2 * intr.msing + 3, 4 * Threads.nthreads())
+
+        # After balancing, should have at least target_n chunks
+        @test length(balanced) >= min(target_n, length(base_chunks) * 50)
+
+        # First chunk starts at the correct position, last chunk ends at the edge
+        @test balanced[1].psi_start ≈ base_chunks[1].psi_start
+        @test balanced[end].psi_end ≈ base_chunks[end].psi_end
+
+        # Consecutive chunks are contiguous UNLESS the previous chunk ends with a
+        # crossing (needs_crossing=true), in which case there is an intentional inner-layer
+        # gap of ≈2·singfac_min/|n·q1| between the pre-crossing and post-crossing intervals.
+        for i in eachindex(balanced)[2:end]
+            if !balanced[i-1].needs_crossing
+                @test balanced[i].psi_start ≈ balanced[i-1].psi_end  rtol=1e-10
+            else
+                # Inner-layer gap: post-crossing chunk starts AFTER the rational surface
+                @test balanced[i].psi_start > balanced[i-1].psi_end
+            end
+        end
+
+        # The total number of needs_crossing=true chunks should equal the original
+        n_crossings_base = count(c -> c.needs_crossing, base_chunks)
+        n_crossings_bal = count(c -> c.needs_crossing, balanced)
+        @test n_crossings_bal == n_crossings_base
+    end
+
+    @testset "chunk_el_integration_bounds direction field — bidirectional mode" begin
+        # Verify that bidirectional=true sets direction=-1 on crossing chunks and direction=+1
+        # on non-crossing chunks, and that balance_integration_chunks propagates these correctly:
+        # the right sub-chunk inherits direction from the parent, the left sub-chunk is always +1.
+        ex = joinpath(@__DIR__, "test_data", "regression_solovev_ideal_example")
+        inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+        inputs["ForceFreeStates"]["verbose"] = false
+        intr = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesInternal(; dir_path=ex)
+        ctrl = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesControl(;
+            (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+        eq_config = GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex)
+        equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(eq_config)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_lim!(intr, ctrl, equil)
+        intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_find!(intr, equil)
+        intr.mlow = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+        intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+        intr.mpert = intr.mhigh - intr.mlow + 1
+        intr.mband = intr.mpert - 1
+        intr.numpert_total = intr.mpert * intr.npert
+
+        odet = GeneralizedPerturbedEquilibrium.ForceFreeStates.OdeState(intr.numpert_total, ctrl.numsteps_init, ctrl.numunorms_init, intr.msing)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.initialize_el_at_axis!(odet, ctrl, equil.profiles, intr)
+
+        # Default (bidirectional=false): all chunks should have direction=+1
+        chunks_fwd = GeneralizedPerturbedEquilibrium.ForceFreeStates.chunk_el_integration_bounds(odet, ctrl, intr)
+        @test all(c -> c.direction == 1, chunks_fwd)
+
+        # bidirectional=true: crossing chunks direction=-1, non-crossing direction=+1
+        chunks_bidi = GeneralizedPerturbedEquilibrium.ForceFreeStates.chunk_el_integration_bounds(odet, ctrl, intr; bidirectional=true)
+        @test count(c -> c.needs_crossing, chunks_bidi) > 0  # at least one crossing chunk
+        for chunk in chunks_bidi
+            if chunk.needs_crossing
+                @test chunk.direction == -1
+            else
+                @test chunk.direction == 1
+            end
+        end
+
+        # balance_integration_chunks preserves direction: right sub-chunk inherits parent direction,
+        # left sub-chunk is always +1 regardless of parent
+        balanced_bidi = GeneralizedPerturbedEquilibrium.ForceFreeStates.balance_integration_chunks(chunks_bidi, ctrl, intr)
+        for chunk in balanced_bidi
+            if chunk.needs_crossing
+                @test chunk.direction == -1
+            else
+                @test chunk.direction == 1
+            end
+        end
+    end
+
+    @testset "Parallel FM integration matches standard ODE — Solovev example" begin
+        # Run standard and parallel FM integrations on the Solovev regression test.
+        # The energy eigenvalue et[1] should match to within 2%.
+        #
+        # Bidirectional FM integration (crossing chunks integrated backward) is the
+        # default for use_parallel=true. It keeps FM propagators well-conditioned for
+        # both small-N (Solovev N=8, tested here) and large-N (DIIID N=26, tested below).
+        ex = joinpath(@__DIR__, "test_data", "regression_solovev_ideal_example")
+
+        function run_solovev(use_parallel)
+            inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+            inputs["ForceFreeStates"]["verbose"] = false
+            inputs["ForceFreeStates"]["use_parallel"] = use_parallel
+            intr = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesInternal(; dir_path=ex)
+            ctrl = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesControl(;
+                (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+            eq_config = GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex)
+            equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(eq_config)
+            intr.wall_settings = GeneralizedPerturbedEquilibrium.Vacuum.WallShapeSettings(;
+                (Symbol(k) => v for (k, v) in inputs["Wall"])...)
+            GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_lim!(intr, ctrl, equil)
+            intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+            GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_find!(intr, equil)
+            intr.mlow = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+            intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+            intr.mpert = intr.mhigh - intr.mlow + 1
+            intr.mband = intr.mpert - 1
+            intr.numpert_total = intr.mpert * intr.npert
+            metric = GeneralizedPerturbedEquilibrium.ForceFreeStates.make_metric(equil; mband=intr.mband, fft_flag=ctrl.fft_flag)
+            ffit = GeneralizedPerturbedEquilibrium.ForceFreeStates.make_matrix(equil, intr, metric)
+            odet = GeneralizedPerturbedEquilibrium.ForceFreeStates.eulerlagrange_integration(ctrl, equil, ffit, intr)
+            vac = GeneralizedPerturbedEquilibrium.ForceFreeStates.free_run!(odet, ctrl, equil, ffit, intr)
+            return real(vac.et[1]), intr
+        end
+
+        et_std, intr_std = run_solovev(false)
+        et_par, intr_par = run_solovev(true)
+
+        # Energy eigenvalue matches to 2%
+        @test isapprox(et_par, et_std; rtol=0.02)
+
+        # Δ' is populated for every singular surface (finite values)
+        # Note: the FM parallel path computes Δ' from ca_l/ca_r accumulated in (S,I)
+        # normalization (Riccati-style crossings). This differs from the sequential path's
+        # (U1,U2) normalization, so absolute Δ' values are not compared here.
+        @test all(s -> !isempty(s.delta_prime), intr_par.sing)
+        @test all(s -> all(isfinite, s.delta_prime), intr_par.sing)
+
+        # delta_prime_col is populated and has the correct shape (N × n_res_modes)
+        N = intr_par.numpert_total
+        @test all(s -> !isempty(s.delta_prime_col), intr_par.sing)
+        @test all(s -> size(s.delta_prime_col, 1) == N, intr_par.sing)
+        @test all(s -> size(s.delta_prime_col, 2) == length(s.delta_prime), intr_par.sing)
+
+        # Diagonal of delta_prime_col matches delta_prime (consistency check)
+        for s in intr_par.sing
+            ipert_res_vals = 1 .+ s.m .- intr_par.mlow .+ (s.n .- intr_par.nlow) .* intr_par.mpert
+            for (i, ipr) in enumerate(ipert_res_vals)
+                @test s.delta_prime_col[ipr, i] ≈ s.delta_prime[i]  rtol=1e-10
+            end
+        end
+    end
+
+    @testset "Parallel FM integration matches standard ODE — DIIID-like example (large N)" begin
+        # Run standard and parallel FM integrations on the DIIID-like example (N≈26 modes).
+        # Before bidirectional integration, the all-forward FM propagators were ill-conditioned
+        # for large N, producing ~10% energy error. Bidirectional integration (backward crossing
+        # chunks + forward intermediate chunks) restores accuracy to within 2%.
+        #
+        # This is the key regression test for the bidirectional parallel FM fix.
+        ex = joinpath(@__DIR__, "..", "examples", "DIIID-like_ideal_example")
+
+        function run_diiid(use_parallel)
+            inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+            inputs["ForceFreeStates"]["verbose"] = false
+            inputs["ForceFreeStates"]["use_parallel"] = use_parallel
+            inputs["ForceFreeStates"]["write_outputs_to_HDF5"] = false
+            intr = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesInternal(; dir_path=ex)
+            ctrl = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesControl(;
+                (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+            eq_config = GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex)
+            equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(eq_config)
+            intr.wall_settings = GeneralizedPerturbedEquilibrium.Vacuum.WallShapeSettings(;
+                (Symbol(k) => v for (k, v) in inputs["Wall"])...)
+            GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_lim!(intr, ctrl, equil)
+            intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+            GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_find!(intr, equil)
+            intr.mlow = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+            intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+            intr.mpert = intr.mhigh - intr.mlow + 1
+            intr.mband = intr.mpert - 1
+            intr.numpert_total = intr.mpert * intr.npert
+            metric = GeneralizedPerturbedEquilibrium.ForceFreeStates.make_metric(equil; mband=intr.mband, fft_flag=ctrl.fft_flag)
+            ffit = GeneralizedPerturbedEquilibrium.ForceFreeStates.make_matrix(equil, intr, metric)
+            odet = GeneralizedPerturbedEquilibrium.ForceFreeStates.eulerlagrange_integration(ctrl, equil, ffit, intr)
+            vac = GeneralizedPerturbedEquilibrium.ForceFreeStates.free_run!(odet, ctrl, equil, ffit, intr)
+            return real(vac.et[1])
+        end
+
+        et_std = run_diiid(false)
+        et_par = run_diiid(true)
+
+        # Energy eigenvalue matches to 2% (bidirectional fix: was ~10% error without it)
+        @test isapprox(et_par, et_std; rtol=0.02)
+    end
+
+    @testset "ode_itime_cost is additive over sub-intervals" begin
+        # Verify cost(a, c) ≈ cost(a, b) + cost(b, c) for b ∈ (a, c) where no
+        # rational surface is inside [a, c]. The cost function uses abs(Δlog) for
+        # each reference point; this is additive only when |psi - ref| is monotone
+        # on [a, c], i.e., when no reference (rational surface, axis, edge) lies
+        # strictly inside the interval. We use the first integration chunk from
+        # chunk_el_integration_bounds, which is guaranteed to contain no rational
+        # surfaces in its interior.
+        ex = joinpath(@__DIR__, "test_data", "regression_solovev_ideal_example")
+        inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+        inputs["ForceFreeStates"]["verbose"] = false
+        intr = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesInternal(; dir_path=ex)
+        ctrl = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesControl(;
+            (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+        eq_config = GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex)
+        equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(eq_config)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_lim!(intr, ctrl, equil)
+        intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_find!(intr, equil)
+        intr.mpert = 8; intr.numpert_total = 8
+
+        # Use the first chunk from chunk_el_integration_bounds: guaranteed rational-free interior
+        odet_tmp = GeneralizedPerturbedEquilibrium.ForceFreeStates.OdeState(8, 10, 5, intr.msing)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.initialize_el_at_axis!(odet_tmp, ctrl, equil.profiles, intr)
+        chunks_tmp = GeneralizedPerturbedEquilibrium.ForceFreeStates.chunk_el_integration_bounds(odet_tmp, ctrl, intr)
+        chunk1 = chunks_tmp[1]
+        a = chunk1.psi_start
+        c = chunk1.psi_end
+        b = (a + c) / 2.0
+
+        cost_ac = GeneralizedPerturbedEquilibrium.ForceFreeStates.ode_itime_cost(a, c, intr)
+        cost_ab = GeneralizedPerturbedEquilibrium.ForceFreeStates.ode_itime_cost(a, b, intr)
+        cost_bc = GeneralizedPerturbedEquilibrium.ForceFreeStates.ode_itime_cost(b, c, intr)
+
+        @test isapprox(cost_ac, cost_ab + cost_bc; rtol=1e-10)
+    end
+
+    @testset "delta_prime_matrix — STRIDE BVP Solovev regression" begin
+        # Verify that the parallel FM path computes a well-formed inter-surface Δ' matrix
+        # via the STRIDE global BVP [Glasser 2018 Phys. Plasmas 25, 032501].
+        # Shape: (2·msing × 2·msing), where index 2j-1 = left side and 2j = right side
+        # of surface j. Each entry is the U₂[ipert_res] response amplitude for one
+        # driving configuration.
+        ex = joinpath(@__DIR__, "test_data", "regression_solovev_ideal_example")
+        inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+        inputs["ForceFreeStates"]["verbose"] = false
+        inputs["ForceFreeStates"]["use_parallel"] = true
+        intr = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesInternal(; dir_path=ex)
+        ctrl = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesControl(;
+            (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+        eq_config = GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex)
+        equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(eq_config)
+        intr.wall_settings = GeneralizedPerturbedEquilibrium.Vacuum.WallShapeSettings(;
+            (Symbol(k) => v for (k, v) in inputs["Wall"])...)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_lim!(intr, ctrl, equil)
+        intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_find!(intr, equil)
+        intr.mlow = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+        intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+        intr.mpert = intr.mhigh - intr.mlow + 1
+        intr.mband = intr.mpert - 1
+        intr.numpert_total = intr.mpert * intr.npert
+        metric = GeneralizedPerturbedEquilibrium.ForceFreeStates.make_metric(equil; mband=intr.mband, fft_flag=ctrl.fft_flag)
+        ffit = GeneralizedPerturbedEquilibrium.ForceFreeStates.make_matrix(equil, intr, metric)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.eulerlagrange_integration(ctrl, equil, ffit, intr)
+
+        msing = intr.msing
+        dpm = intr.delta_prime_matrix
+
+        # Matrix is populated with correct shape (2·msing × 2·msing)
+        @test !isempty(dpm)
+        @test size(dpm) == (2 * msing, 2 * msing)
+
+        # All elements are finite
+        @test all(isfinite, dpm)
+
+        # Diagonal (self-response) elements are non-zero for each surface side
+        for j in 1:msing
+            @test abs(dpm[2j-1, 2j-1]) > 1e-10
+            @test abs(dpm[2j,   2j  ]) > 1e-10
+        end
+    end
+
+    @testset "delta_prime_matrix — STRIDE BVP DIIID-like regression (large N)" begin
+        # Verify that the parallel FM path computes a well-formed inter-surface Δ' matrix
+        # for the DIIID-like case (N≈26 modes, multiple rational surfaces). This complements
+        # the Solovev test above by exercising the BVP assembly with more surfaces and larger
+        # mode space, where ill-conditioned (non-bidirectional) FM propagators would fail.
+        ex = joinpath(@__DIR__, "..", "examples", "DIIID-like_ideal_example")
+        inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+        inputs["ForceFreeStates"]["verbose"] = false
+        inputs["ForceFreeStates"]["use_parallel"] = true
+        inputs["ForceFreeStates"]["write_outputs_to_HDF5"] = false
+        intr = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesInternal(; dir_path=ex)
+        ctrl = GeneralizedPerturbedEquilibrium.ForceFreeStates.ForceFreeStatesControl(;
+            (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+        eq_config = GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex)
+        equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(eq_config)
+        intr.wall_settings = GeneralizedPerturbedEquilibrium.Vacuum.WallShapeSettings(;
+            (Symbol(k) => v for (k, v) in inputs["Wall"])...)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_lim!(intr, ctrl, equil)
+        intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.sing_find!(intr, equil)
+        intr.mlow = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+        intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+        intr.mpert = intr.mhigh - intr.mlow + 1
+        intr.mband = intr.mpert - 1
+        intr.numpert_total = intr.mpert * intr.npert
+        metric = GeneralizedPerturbedEquilibrium.ForceFreeStates.make_metric(equil; mband=intr.mband, fft_flag=ctrl.fft_flag)
+        ffit = GeneralizedPerturbedEquilibrium.ForceFreeStates.make_matrix(equil, intr, metric)
+        GeneralizedPerturbedEquilibrium.ForceFreeStates.eulerlagrange_integration(ctrl, equil, ffit, intr)
+
+        msing = intr.msing
+        dpm = intr.delta_prime_matrix
+
+        # Matrix is populated with correct shape (2·msing × 2·msing)
+        @test !isempty(dpm)
+        @test size(dpm) == (2 * msing, 2 * msing)
+
+        # All elements are finite
+        @test all(isfinite, dpm)
+
+        # Diagonal (self-response) elements are non-zero for each surface side
+        for j in 1:msing
+            @test abs(dpm[2j-1, 2j-1]) > 1e-10
+            @test abs(dpm[2j,   2j  ]) > 1e-10
+        end
+    end
+
+end
diff --git a/test/runtests_riccati.jl b/test/runtests_riccati.jl
new file mode 100644
index 00000000..5681b691
--- /dev/null
+++ b/test/runtests_riccati.jl
@@ -0,0 +1,251 @@
+using LinearAlgebra, Random, TOML
+
+const FFS = GeneralizedPerturbedEquilibrium.ForceFreeStates
+
+# Configure a fresh ForceFreeStatesInternal from an already-built equilibrium.
+# Cheap (sing_lim! + sing_find! + field assignment). Separate from equil/ffit
+# setup because intr is mutated by each integration (sing[s].delta_prime etc.).
+function make_solovev_intr(inputs, ctrl, equil, ex)
+    intr = FFS.ForceFreeStatesInternal(; dir_path=ex)
+    intr.wall_settings = GeneralizedPerturbedEquilibrium.Vacuum.WallShapeSettings(;
+        (Symbol(k) => v for (k, v) in inputs["Wall"])...)
+    FFS.sing_lim!(intr, ctrl, equil)
+    intr.nlow = ctrl.nn_low; intr.nhigh = ctrl.nn_high; intr.npert = 1
+    FFS.sing_find!(intr, equil)
+    intr.mlow  = min(intr.nlow * equil.params.qmin, 0) - 4 - ctrl.delta_mlow
+    intr.mhigh = trunc(Int, intr.nhigh * equil.params.qmax) + ctrl.delta_mhigh
+    intr.mpert = intr.mhigh - intr.mlow + 1
+    intr.mband = intr.mpert - 1
+    intr.numpert_total = intr.mpert * intr.npert
+    return intr
+end
+
+@testset "Riccati Integration Tests" begin
+
+    # ── Pure matrix unit tests — no equilibrium needed ────────────────────────
+
+    @testset "renormalize_riccati_inplace!" begin
+        N = 4
+        # Build a random (U₁, U₂) pair and verify renorm gives S = U₁·U₂⁻¹ with U₂_new = I
+        rng = [1.0+0.5im  0.2im    0.1      0.3im;
+               0.0        1.2+0.1im 0.0im   0.2;
+               0.1+0.1im  0.0      0.9+0.3im 0.1im;
+               0.0im      0.2      0.0      1.1+0.2im]
+        U1 = rng .+ 0.5*I(N)
+        U2 = 0.5*rng .+ I(N)  # near-identity to ensure invertibility
+
+        u = zeros(ComplexF64, N, N, 2)
+        u[:, :, 1] .= U1
+        u[:, :, 2] .= U2
+
+        S_expected = U1 / U2  # = U₁ · U₂⁻¹
+
+        FFS.renormalize_riccati_inplace!(u, N)
+
+        @test u[:, :, 2] ≈ I(N)
+        @test u[:, :, 1] ≈ S_expected  rtol=1e-12
+    end
+
+    @testset "renormalize_riccati_inplace! idempotent" begin
+        N = 3
+        # If U₂ = I already, renorm should leave u unchanged
+        S = [1.0+0.5im  0.2im    0.1;
+             0.0im      1.2+0.1im 0.0;
+             0.1+0.1im  0.0      0.9+0.3im]
+        u = zeros(ComplexF64, N, N, 2)
+        u[:, :, 1] .= S
+        u[:, :, 2] .= I(N)
+
+        FFS.renormalize_riccati_inplace!(u, N)
+
+        @test u[:, :, 2] ≈ I(N)
+        @test u[:, :, 1] ≈ S  rtol=1e-12
+    end
+
+    @testset "renormalize_riccati! (OdeState)" begin
+        N = 3
+        rng = [1.0+0.5im  0.2im    0.1;
+               0.0im      1.2+0.1im 0.0;
+               0.1+0.1im  0.0      0.9+0.3im]
+        U1 = rng .+ 0.5*I(N)
+        U2 = 0.2*rng .+ I(N)
+
+        odet = FFS.OdeState(N, 10, 5, 1)
+        odet.u[:, :, 1] .= U1
+        odet.u[:, :, 2] .= U2
+
+        S_expected = U1 / U2
+        intr = FFS.ForceFreeStatesInternal(; mpert=N, numpert_total=N)
+
+        FFS.renormalize_riccati!(odet, intr)
+
+        @test odet.u[:, :, 2] ≈ I(N)
+        @test odet.u[:, :, 1] ≈ S_expected  rtol=1e-12
+    end
+
+    # ── Shared Solovev setup ──────────────────────────────────────────────────
+    #
+    # equil (Grad-Shafranov solve) and ffit (metric matrices) are expensive and
+    # immutable after construction — built ONCE and shared across all tests below.
+    # intr is cheap to (re)initialize but is mutated by each integration run
+    # (sing[s].delta_prime etc.), so a fresh copy is made for each integration.
+    #
+    # Integration runs:
+    #   intr_ric / odet_ric — Riccati path (shared by most tests)
+    #   intr_std / odet_std — Standard path (energy comparison only)
+
+    ex = joinpath(@__DIR__, "test_data", "regression_solovev_ideal_example")
+    inputs = TOML.parsefile(joinpath(ex, "gpec.toml"))
+    inputs["ForceFreeStates"]["verbose"] = false
+
+    ctrl  = FFS.ForceFreeStatesControl(;
+                (Symbol(k) => v for (k, v) in inputs["ForceFreeStates"])...)
+    equil = GeneralizedPerturbedEquilibrium.Equilibrium.setup_equilibrium(
+                GeneralizedPerturbedEquilibrium.Equilibrium.EquilibriumConfig(inputs["Equilibrium"], ex))
+
+    intr_tmp = make_solovev_intr(inputs, ctrl, equil, ex)
+    metric   = FFS.make_metric(equil; mband=intr_tmp.mband, fft_flag=ctrl.fft_flag)
+    ffit     = FFS.make_matrix(equil, intr_tmp, metric)
+    N        = intr_tmp.numpert_total
+
+    # Riccati integration
+    intr_ric = make_solovev_intr(inputs, ctrl, equil, ex)
+    odet_ric = FFS.riccati_eulerlagrange_integration(ctrl, equil, ffit, intr_ric)
+
+    # Save inline Δ' values before any test that calls compute_delta_prime_from_ca!
+    # (which overwrites intr_ric.sing[s].delta_prime)
+    delta_prime_inline = [copy(intr_ric.sing[s].delta_prime) for s in 1:intr_ric.msing]
+
+    vac_ric = FFS.free_run!(odet_ric, ctrl, equil, ffit, intr_ric)
+    et_ric  = real(vac_ric.et[1])
+
+    # Standard integration (needed only for energy comparison)
+    intr_std = make_solovev_intr(inputs, ctrl, equil, ex)
+    odet_std = FFS.eulerlagrange_integration(ctrl, equil, ffit, intr_std)
+    vac_std  = FFS.free_run!(odet_std, ctrl, equil, ffit, intr_std)
+    et_std   = real(vac_std.et[1])
+
+    # ─────────────────────────────────────────────────────────────────────────
+
+    @testset "Riccati integration matches standard ODE — Solovev example" begin
+        # Energy eigenvalue matches to 1%
+        @test isapprox(et_ric, et_std; rtol=0.01)
+
+        # Riccati uses no more than 2x as many steps as standard
+        @test odet_ric.step <= 2 * odet_std.step
+    end
+
+    @testset "Δ' computed by Riccati path — Solovev regression" begin
+        # Verify that the Riccati path populates delta_prime with physically correct values.
+        #
+        # The Riccati path computes Δ' in the bounded (U₁, U₂) normalization: before the
+        # crossing, the callback guarantees max(|U₁|, |U₂|) ≤ ucrit, and the asymptotic is
+        # introduced directly in column ipert_res (no GR permutation). This gives:
+        #   ca_r[ipert_res, ipert_res, 2] = 1  (exactly, by construction)
+        #   Δ' = (1 - ca_l[ipert_res, ipert_res, 2]) / (4π²·psio)
+        #
+        # The standard path uses Gaussian Reduction which inflates the resonant column's
+        # asymptotic coefficients, so it does NOT populate intr.sing[s].delta_prime.
+        # Use SingularCoupling.jl (which reads ca_l/ca_r directly) for standard-path Δ'.
+
+        # Riccati path should populate delta_prime for every singular surface
+        @test all(s -> !isempty(s.delta_prime), intr_ric.sing)
+
+        # All Riccati Δ' values should be finite
+        @test all(s -> all(isfinite, s.delta_prime), intr_ric.sing)
+
+        # Regression: Solovev Δ' values (in the bounded Riccati normalization).
+        # Positive Δ' (surface 1) and negative Δ' (surface 2) are both physically plausible
+        # for this configuration.
+        @test isapprox(real(intr_ric.sing[1].delta_prime[1]),  57.3; rtol=0.05)
+        @test isapprox(real(intr_ric.sing[2].delta_prime[1]), -4.03; rtol=0.05)
+
+        # delta_prime_col is populated, has correct shape (N × n_res_modes), and
+        # its diagonal elements match delta_prime exactly.
+        @test all(s -> !isempty(s.delta_prime_col), intr_ric.sing)
+        @test all(s -> size(s.delta_prime_col, 1) == N, intr_ric.sing)
+        @test all(s -> size(s.delta_prime_col, 2) == length(s.delta_prime), intr_ric.sing)
+        for s in intr_ric.sing
+            ipert_res_vals = 1 .+ s.m .- intr_ric.mlow .+ (s.n .- intr_ric.nlow) .* intr_ric.mpert
+            for (i, ipr) in enumerate(ipert_res_vals)
+                @test s.delta_prime_col[ipr, i] ≈ s.delta_prime[i]  rtol=1e-10
+            end
+        end
+    end
+
+    @testset "Riccati end state has U₂ ≈ I" begin
+        # After riccati_eulerlagrange_integration, odet.u[:,:,2] should be identity
+        # (canonical Riccati convention after final renorm)
+        @test odet_ric.u[:, :, 2] ≈ I(N)  rtol=1e-10
+    end
+
+    @testset "riccati_der! formula — Glasser 2018 Eq. 19" begin
+        # Verify riccati_der! correctly evaluates dS/dψ = w†·F̄⁻¹·w − S·Ḡ·S, w = Q − K̄·S.
+        #
+        # Test states are Hermitian (physical constraint: the EL system preserves S†=S from
+        # the axis). Non-Hermitian states would give ~5% disagreement — not a bug, but a
+        # consequence of the derivation assuming the physical symmetry.
+        #
+        # See benchmarks/benchmark_riccati_der.jl for the extended version with output.
+
+        # Use an initialized OdeState just for spline_hint and chunk bounds
+        odet_tmp = FFS.OdeState(N, ctrl.numsteps_init, ctrl.numunorms_init, intr_ric.msing)
+        FFS.initialize_el_at_axis!(odet_tmp, ctrl, equil.profiles, intr_ric)
+        chunks = FFS.chunk_el_integration_bounds(odet_tmp, ctrl, intr_ric)
+
+        # 30% into each chunk: away from singularities at psi_end
+        test_psis = [c.psi_start + 0.3 * (c.psi_end - c.psi_start) for c in chunks]
+
+        rng = Random.MersenneTwister(42)
+        for psi in test_psis
+            # Hermitian S: physical Riccati matrix is Hermitian (preserved by EL symmetry)
+            A = randn(rng, ComplexF64, N, N)
+            S = (A + A') / 2
+
+            # Manual RHS: w†·F̄⁻¹·w − S·Ḡ·S
+            L    = zeros(ComplexF64, N, N)
+            Kmat = zeros(ComplexF64, N, N)
+            Gmat = zeros(ComplexF64, N, N)
+            ffit.fmats_lower(vec(L),    psi; hint=ffit._hint)
+            ffit.kmats(vec(Kmat), psi; hint=ffit._hint)
+            ffit.gmats(vec(Gmat), psi; hint=ffit._hint)
+            q       = equil.profiles.q_spline(psi)
+            singfac = vec(1.0 ./ ((intr_ric.mlow:intr_ric.mhigh) .-
+                                   q .* (intr_ric.nlow:intr_ric.nhigh)'))
+            w = -Kmat * S
+            for i in 1:N; w[i, i] += singfac[i]; end
+            v = copy(w)
+            ldiv!(LowerTriangular(L), v)
+            ldiv!(UpperTriangular(L'), v)
+            dS_manual = adjoint(w) * v - S * Gmat * S
+
+            # riccati_der! RHS
+            u_ric  = zeros(ComplexF64, N, N, 2)
+            du_ric = zeros(ComplexF64, N, N, 2)
+            u_ric[:, :, 1] .= S
+            u_ric[:, :, 2] .= Matrix{ComplexF64}(I, N, N)
+            dummy  = FFS.IntegrationChunk(psi, psi, false, 0, 1)
+            params = (ctrl, equil, ffit, intr_ric, odet_tmp, dummy)
+            FFS.riccati_der!(du_ric, u_ric, params, psi)
+
+            rel_err = norm(du_ric[:, :, 1] - dS_manual) / max(norm(dS_manual), 1e-10)
+            @test rel_err < 1e-10
+        end
+    end
+
+    @testset "compute_delta_prime_from_ca! matches inline Δ'" begin
+        # Verify the standalone Δ' formula matches the inline Riccati crossing computation.
+        # Both apply the identical diagonal formula to the same ca_l/ca_r arrays, so the
+        # result must be bit-for-bit identical (not just approximately equal).
+        #
+        # Note: this call overwrites intr_ric.sing[s].delta_prime; delta_prime_inline was
+        # saved before free_run! above so it holds the original inline values.
+        #
+        # See benchmarks/benchmark_delta_prime_methods.jl for the extended version.
+        FFS.compute_delta_prime_from_ca!(odet_ric, intr_ric, equil)
+        for s in 1:intr_ric.msing
+            @test intr_ric.sing[s].delta_prime == delta_prime_inline[s]
+        end
+    end
+
+end