Parallel simulations using MPI.jl #141

Draft: wants to merge 39 commits into base: master

Commits (39)
c2b0db6
double ghost working and solver fix
Apr 23, 2024
07aca6f
working mpi Isend/Irecv! with WaterLily
Jun 9, 2024
ffad91a
add new WaterLily.BC! function
Jun 10, 2024
6d41236
update examples/Project.toml
Jun 10, 2024
c404a52
Merge branch 'weymouth:master' into MPI
marinlauber Jun 10, 2024
66d485b
add BC! function for scalar and vector fields
Jun 11, 2024
ef3316a
update examples/Project.toml
Jun 11, 2024
e168ba2
working conv_diff! but not project!
Jun 12, 2024
8994832
fix loc function for double halos
Jun 12, 2024
054711e
add MPI functions and Poisson working
Jun 24, 2024
6977c1f
add MPI as extension
Jun 24, 2024
511cc44
try and fix extension, not working
Jun 25, 2024
2d4f4bf
merge master into MPI
Jun 25, 2024
cae2d1e
type dispatch for BC! functions
Jun 25, 2024
3294511
working mom_step! with Poisson solver
marinlauber Jul 3, 2024
4d2cc50
running with MG solver, but no convergence
Jul 3, 2024
b94babe
start proper testing
marinlauber Jul 5, 2024
23a96d9
proper testing and psolver issue
Jul 5, 2024
2945552
fix solver inside/inside_u
Jul 6, 2024
cd1948f
fixed inside function for double halos
Jul 7, 2024
312b1b8
working parallel poisson solver
Jul 11, 2024
50c8695
add MPIArray type
Jul 11, 2024
a8abd84
Merge branch 'WaterLily-jl:master' into MPI
marinlauber Jul 11, 2024
1ee9341
fix type dispatch for MPIArray
Jul 11, 2024
f09736d
fix type dispatch for MPIArray
Jul 11, 2024
4a5eb0e
AllReduce in residuals!(pois)
marinlauber Jul 11, 2024
281818b
perBC! in Jacobi
marinlauber Jul 12, 2024
7241d39
tidy solver and MPI
marinlauber Jul 12, 2024
ec72ec1
add exitBC! function
marinlauber Jul 12, 2024
7fdbdd8
MPIArray working with sims
marinlauber Jul 12, 2024
b62cde4
clean MPIArray methods
marinlauber Jul 12, 2024
13388de
add Parallel VTK function
marinlauber Jul 13, 2024
883e17b
fix pvtk export with MPI
marinlauber Jul 15, 2024
8bf893a
MPI as an extension running
Jul 16, 2024
283a46b
MPI in extension and remove old files
Jul 16, 2024
9af4ad2
move all write files to their extensions and tidy examples
marinlauber Jul 17, 2024
0bd4e68
remove unused tests
marinlauber Jul 17, 2024
cdc1ba6
move some tests into proper testing file
marinlauber Jul 17, 2024
5b6de27
reduce mpi swaps in BC!
Jul 17, 2024
3 changes: 3 additions & 0 deletions Project.toml
@@ -20,12 +20,14 @@
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"

[extensions]
WaterLilyAMDGPUExt = "AMDGPU"
WaterLilyCUDAExt = "CUDA"
WaterLilyReadVTKExt = "ReadVTK"
WaterLilyWriteVTKExt = "WriteVTK"
WaterLilyMPIExt = "MPI"

[compat]
AMDGPU = "^0.4.13,0.6"
@@ -53,6 +55,7 @@
ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"

[targets]
test = ["Test", "BenchmarkTools", "CUDA", "AMDGPU", "GPUArrays", "WriteVTK", "ReadVTK"]
5 changes: 4 additions & 1 deletion examples/Project.toml
@@ -1,13 +1,16 @@
[deps]
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
GR = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71"
GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
Meshing = "e6723b4c-ebff-59f1-b4b7-d97aa5274f73"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
57 changes: 57 additions & 0 deletions examples/TwoD_CircleMPI.jl
@@ -0,0 +1,57 @@
#mpiexecjl --project=. -n 2 julia TwoD_CircleMPI.jl
Review comment (Member):
just FYI --project=. and just --project are equivalent.


using WaterLily
using WriteVTK
using MPI
using StaticArrays

# make a writer with some attributes; we now need to apply the BCs when writing
velocity(a::Simulation) = a.flow.u |> Array;
pressure(a::Simulation) = a.flow.p |> Array;
_body(a::Simulation) = (measure_sdf!(a.flow.σ, a.body);
a.flow.σ |> Array;)
vorticity(a::Simulation) = (@inside a.flow.σ[I] =
WaterLily.curl(3,I,a.flow.u)*a.L/a.U;
WaterLily.perBC!(a.flow.σ,());
a.flow.σ |> Array;)
_vbody(a::Simulation) = a.flow.V |> Array;
mu0(a::Simulation) = a.flow.μ₀ |> Array;
ranks(a::Simulation) = (a.flow.σ.=0;
@inside a.flow.σ[I] = me()+1;
WaterLily.perBC!(a.flow.σ,());
a.flow.σ |> Array;)

custom_attrib = Dict(
"u" => velocity,
"p" => pressure,
"d" => _body,
"ω" => vorticity,
"v" => _vbody,
"μ₀" => mu0,
"rank" => ranks
) # this maps each output name in the file to the function that computes it

"""Flow around a circle"""
function circle(dims,center,radius;Re=250,U=1,psolver=MultiLevelPoisson,mem=Array)
body = AutoBody((x,t)->√sum(abs2, x .- center) - radius)
Simulation(dims, (U,0), radius; ν=U*radius/Re, body, mem=mem, psolver=psolver)
end

# last one standing...
WaterLily.grid_loc() = mpi_grid().global_loc

# local grid size
L = 2^6

# init the MPI grid and the simulation
r = init_mpi((L,L))
sim = circle((L,L),SA[L/2,L/2+2],L/8;mem=MPIArray) #use MPIArray to use extension

wr = vtkWriter("WaterLily-circle-2";attrib=custom_attrib,dir="vtk_data",
extents=get_extents(sim.flow.p))
for _ in 1:50
sim_step!(sim,sim_time(sim)+1.0,verbose=true)
write!(wr,sim)
end
close(wr)
finalize_mpi()
14 changes: 14 additions & 0 deletions examples/TwoD_CircleMPIArray.jl
@@ -0,0 +1,14 @@
using WaterLily
# using MPI

# circle simulation
function circle(n,m;Re=250,U=1)
radius, center = m/8, m/2
body = AutoBody((x,t)->√sum(abs2, x .- center) - radius)
Simulation((n,m), (U,0), radius; ν=U*radius/Re, body, mem=MPIArray)
end

# test on sim
include("TwoD_plots.jl")
sim = circle(3*2^6,2^7)
sim_gif!(sim,duration=10,clims=(-5,5),plotbody=true)
243 changes: 243 additions & 0 deletions ext/WaterLilyMPIExt.jl
@@ -0,0 +1,243 @@
module WaterLilyMPIExt

if isdefined(Base, :get_extension)
using MPI
else
using ..MPI
end

using StaticArrays
using WaterLily
import WaterLily: init_mpi,me,mpi_grid,finalize_mpi,get_extents
import WaterLily: BC!,perBC!,exitBC!,L₂,L∞,loc,grid_loc,_dot,CFL,residual!,sim_step!

const NDIMS_MPI = 3 # Internally, we always set the number of dimensions to 3 for MPI calls. This ensures a fixed size for MPI coords, neighbors, etc., and keeps the code simple and easy to read.
const NNEIGHBORS_PER_DIM = 2

"""
halos(dims,d)

Return the CartesianIndices of the halos in dimension `±d` of an array of size `dims`.
"""
function halos(dims::NTuple{N},j) where N
CartesianIndices(ntuple( i-> i==abs(j) ? j<0 ? (1:2) : (dims[i]-1:dims[i]) : (1:dims[i]), N))
end
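# e.g. for a 10×10 array, `halos` selects the two ghost layers on either side of dimension 1:
#   halos((10,10),-1) == CartesianIndices((1:2,1:10))   # low side
#   halos((10,10),+1) == CartesianIndices((9:10,1:10))  # high side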
"""
buff(dims,d)

Return the CartesianIndices of the buffer in dimension `±d` of an array of size `dims`.
"""
function buff(dims::NTuple{N},j) where N
CartesianIndices(ntuple( i-> i==abs(j) ? j<0 ? (3:4) : (dims[i]-3:dims[i]-2) : (1:dims[i]), N))
end
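# e.g. the buffer is the pair of interior cells adjacent to each halo, i.e. the data that gets sent:
#   buff((10,10),-1) == CartesianIndices((3:4,1:10))    # sent to the low-side neighbor
#   buff((10,10),+1) == CartesianIndices((7:8,1:10))    # sent to the high-side neighbor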

"""
mpi_swap!(send1,recv1,send2,recv2,neighbor,comm)

Swap halo data with the two neighboring ranks in a given dimension: `send1` is sent to `neighbor[1]` and the matching message is received in `recv1`; likewise `send2`/`recv2` for `neighbor[2]`.
The transfers use non-blocking `Isend`/`Irecv!` calls, and the function waits for all of them to complete before returning.
"""
function mpi_swap!(send1,recv1,send2,recv2,neighbor,comm)
reqs=MPI.Request[]
# Send to / receive from neighbor 1 in dimension d
push!(reqs,MPI.Isend(send1, neighbor[1], 0, comm))
push!(reqs,MPI.Irecv!(recv1, neighbor[1], 1, comm))
# Send to / receive from neighbor 2 in dimension d
push!(reqs,MPI.Irecv!(recv2, neighbor[2], 0, comm))
push!(reqs,MPI.Isend(send2, neighbor[2], 1, comm))
# wait for all transfers to complete
MPI.Waitall!(reqs)
end
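# Tag matching: a message sent with tag 0 to neighbor[1] is received with tag 0 by that
# rank's neighbor[2] request (and vice-versa for tag 1), so the two directions never mix,
# even when both neighbors are the same rank.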

"""
perBC!(a,perdir)

Update the halos of the scalar field `a` by swapping buffer data with the neighboring MPI ranks.
"""
perBC!(a::MPIArray,::Tuple{}) = _perBC!(a, size(a), true)
perBC!(a::MPIArray, perdir, N = size(a)) = _perBC!(a, N, true)
_perBC!(a, N, mpi::Bool) = for d ∈ eachindex(N)
# get data to transfer @TODO use @views
send1 = a[buff(N,-d)]; send2 =a[buff(N,+d)]
recv1 = zero(send1); recv2 = zero(send2)
# swap
mpi_swap!(send1,recv1,send2,recv2,neighbors(d),mpi_grid().comm)

# update halos from the received data (only where there is a neighboring rank)
!mpi_wall(d,1) && (a[halos(N,-d)] .= recv1) # halo swap
!mpi_wall(d,2) && (a[halos(N,+d)] .= recv2) # halo swap
end
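# Note: the received buffers only overwrite the halos on internal (processor) boundaries;
# on physical walls (`mpi_wall`) the halo values are left untouched.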

"""
BC!(a,A,saveexit=false,perdir=())

Apply boundary conditions to the vector field `a` using the MPI grid: halos are swapped with neighboring ranks, while physical domain walls receive the Dirichlet flux `A`, zero-gradient, or exit conditions set below.
"""
# function BC!(a::MPIArray,A,saveexit=false,perdir=())
# N,n = WaterLily.size_u(a)
# for i ∈ 1:n, d ∈ 1:n
# # get data to transfer @TODO use @views
# send1 = a[buff(N,-d),i]; send2 = a[buff(N,+d),i]
# recv1 = zero(send1); recv2 = zero(send2)
# # swap
# mpi_swap!(send1,recv1,send2,recv2,neighbors(d),mpi_grid().comm)

# # this sets the BCs on the domain boundary and transfers the data
# if mpi_wall(d,1) # left wall
# if i==d # set flux
# a[halos(N,-d),i] .= A[i]
# a[WaterLily.slice(N,3,d),i] .= A[i]
# else # zero gradient
# a[halos(N,-d),i] .= reverse(send1; dims=d)
# end
# else # neighbor on the left
# a[halos(N,-d),i] .= recv1
# end
# if mpi_wall(d,2) # right wall
# if i==d && (!saveexit || i>1) # convection exit
# a[halos(N,+d),i] .= A[i]
# else # zero gradient
# a[halos(N,+d),i] .= reverse(send2; dims=d)
# end
# else # neighbor on the right
# a[halos(N,+d),i] .= recv2
# end
# end
# end
using EllipsisNotation
function BC!(a::MPIArray,A,saveexit=false,perdir=())
N,n = WaterLily.size_u(a)
for d ∈ 1:n # transfer full halos in each direction
# get data to transfer @TODO use @views
send1 = a[buff(N,-d),:]; send2 = a[buff(N,+d),:]
recv1 = zero(send1); recv2 = zero(send2)
# swap
mpi_swap!(send1,recv1,send2,recv2,neighbors(d),mpi_grid().comm)

# mpi boundary swap
!mpi_wall(d,1) && (a[halos(N,-d),:] .= recv1) # halo swap
!mpi_wall(d,2) && (a[halos(N,+d),:] .= recv2) # halo swap

for i ∈ 1:n # this sets the BCs on the physical boundary
if mpi_wall(d,1) # left wall
if i==d # set flux
a[halos(N,-d),i] .= A[i]
a[WaterLily.slice(N,3,d),i] .= A[i]
else # zero gradient
a[halos(N,-d),i] .= reverse(send1[..,i]; dims=d)
end
end
if mpi_wall(d,2) # right wall
if i==d && (!saveexit || i>1) # convection exit
a[halos(N,+d),i] .= A[i]
else # zero gradient
a[halos(N,+d),i] .= reverse(send2[..,i]; dims=d)
end
end
end
end
end
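# Usage sketch (assuming a velocity field `u::MPIArray` and inflow `U=(1,0)`):
#   BC!(u, U)         # halo swap, Dirichlet flux on normal components, zero gradient elsewhere
#   BC!(u, U, true)   # same, but keep a zero-gradient exit in x so exitBC! can convect it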

function exitBC!(u::MPIArray,u⁰,U,Δt)
N,_ = WaterLily.size_u(u)
exitR = WaterLily.slice(N.-2,N[1]-2,1,3) # exit slice excluding ghosts
# ∮udA = 0
# if mpi_wall(1,2) #right wall
@WaterLily.loop u[I,1] = u⁰[I,1]-U[1]*Δt*(u⁰[I,1]-u⁰[I-δ(1,I),1]) over I ∈ exitR
∮u = sum(u[exitR,1])/length(exitR)-U[1] # mass flux imbalance
# end
# ∮u = MPI.Allreduce(∮udA,+,mpi_grid().comm) # domain imbalance
# mpi_wall(1,2) && (@WaterLily.loop u[I,1] -= ∮u over I ∈ exitR) # correct flux only on right wall
@WaterLily.loop u[I,1] -= ∮u over I ∈ exitR # correct flux only on right wall
end
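# The exit plane is advanced with a first-order convective outflow,
#   u[I] = u⁰[I] - U[1]*Δt*(u⁰[I] - u⁰[I-δ]),
# and then shifted by the mean flux imbalance ∮u so the net outflow matches U[1].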

struct MPIGrid #{I,C<:MPI.Comm,N<:AbstractVector,M<:AbstractArray,G<:AbstractVector}
me::Int # rank
comm::MPI.Comm # communicator
coords::AbstractVector # coordinates
neighbors::AbstractArray # neighbors
global_loc::AbstractVector # the location of the lower left corner in global index space
end
const MPI_GRID_NULL = MPIGrid(-1,MPI.COMM_NULL,[-1,-1,-1],[-1 -1 -1; -1 -1 -1],[0,0,0])

let
global MPIGrid, set_mpi_grid, mpi_grid, mpi_initialized, check_mpi

# allows access to the global mpi grid
_mpi_grid::MPIGrid = MPI_GRID_NULL
mpi_grid()::MPIGrid = (check_mpi(); _mpi_grid::MPIGrid)
set_mpi_grid(grid::MPIGrid) = (_mpi_grid = grid;)
mpi_initialized() = (_mpi_grid.comm != MPI.COMM_NULL)
check_mpi() = !mpi_initialized() && error("MPI not initialized")
end
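# Access pattern: `init_mpi` stores the grid via `set_mpi_grid`, after which `mpi_grid()`
# returns it anywhere in the extension; calling `mpi_grid()` before initialization throws
# "MPI not initialized".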

function init_mpi(Dims::NTuple{D};dims=[0, 0, 0],periods=[0, 0, 0],comm::MPI.Comm=MPI.COMM_WORLD,
disp::Integer=1,reorder::Bool=true) where D
# MPI
MPI.Init()
nprocs = MPI.Comm_size(comm)
# create cartesian communicator
MPI.Dims_create!(nprocs, dims)
comm_cart = MPI.Cart_create(comm, dims, periods, reorder)
me = MPI.Comm_rank(comm_cart)
coords = MPI.Cart_coords(comm_cart)
# make the cart comm
neighbors = fill(MPI.PROC_NULL, NNEIGHBORS_PER_DIM, NDIMS_MPI);
for i = 1:NDIMS_MPI
neighbors[:,i] .= MPI.Cart_shift(comm_cart, i-1, disp);
end
# global index coordinate in grid space
global_loc = SVector([coords[i]*Dims[i] for i in 1:D]...)
set_mpi_grid(MPIGrid(me,comm_cart,coords,neighbors,global_loc))
return me; # this is the most useful MPI variable to have in the local space
end
finalize_mpi() = MPI.Finalize()
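# Typical driver lifecycle (a sketch based on examples/TwoD_CircleMPI.jl, per-rank grid of side L):
#   r = init_mpi((L,L))                              # build the Cartesian communicator
#   sim = Simulation((L,L),(1,0),L/8; mem=MPIArray)  # sketch; body, ν, psolver as usual
#   sim_step!(sim, 1.0)
#   finalize_mpi()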

# global coordinate in grid space
# grid_loc(grid::MPIGrid=mpi_grid()) = grid.global_loc
me()= mpi_grid().me
neighbors(dim) = mpi_grid().neighbors[:,dim]
mpi_wall(dim,i) = mpi_grid().neighbors[i,dim]==MPI.PROC_NULL

L₂(a::MPIArray{T}) where T = MPI.Allreduce(sum(T,abs2,@inbounds(a[I]) for I ∈ inside(a)),+,mpi_grid().comm)
function L₂(p::Poisson{T,S}) where {T,S<:MPIArray{T}} # should work on the GPU
MPI.Allreduce(sum(T,@inbounds(p.r[I]*p.r[I]) for I ∈ inside(p.r)),+,mpi_grid().comm)
end
L∞(a::MPIArray) = MPI.Allreduce(maximum(abs.(a)),Base.max,mpi_grid().comm)
L∞(p::Poisson{T,S}) where {T,S<:MPIArray{T}} = MPI.Allreduce(maximum(abs.(p.r)),Base.max,mpi_grid().comm)
function _dot(a::MPIArray{T},b::MPIArray{T}) where T
MPI.Allreduce(sum(T,@inbounds(a[I]*b[I]) for I ∈ inside(a)),+,mpi_grid().comm)
end

function CFL(a::Flow{D,T,S};Δt_max=10) where {D,T,S<:MPIArray{T}}
@inside a.σ[I] = WaterLily.flux_out(I,a.u)
MPI.Allreduce(min(Δt_max,inv(maximum(a.σ)+5a.ν)),Base.min,mpi_grid().comm)
end
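# the Allreduce takes the global minimum so every rank advances with the same Δt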
# this actually adds a global communication every time residual! is called
function residual!(p::Poisson{T,S}) where {T,S<:MPIArray{T}}
WaterLily.perBC!(p.x,p.perdir)
@inside p.r[I] = ifelse(p.iD[I]==0,0,p.z[I]-WaterLily.mult(I,p.L,p.D,p.x))
# s = sum(p.r)/length(inside(p.r))
s = MPI.Allreduce(sum(p.r)/length(inside(p.r)),+,mpi_grid().comm)
abs(s) <= 2eps(eltype(s)) && return
@inside p.r[I] = p.r[I]-s
end
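# the mean is reduced over all ranks so every rank subtracts the same value,
# keeping the distributed Poisson residual consistent across the full domain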

function sim_step!(sim::Simulation{D,T,S},t_end;remeasure=true,
max_steps=typemax(Int),verbose=false) where {D,T,S<:MPIArray{T}}
steps₀ = length(sim.flow.Δt)
while sim_time(sim) < t_end && length(sim.flow.Δt) - steps₀ < max_steps
sim_step!(sim; remeasure)
(verbose && me()==0) && println("tU/L=",round(sim_time(sim),digits=4),
", Δt=",round(sim.flow.Δt[end],digits=3))
end
end

# helper function for the vtk writer
function get_extents(a::MPIArray)
xs = Tuple(ifelse(x==0,1,x+1):ifelse(x==0,n+4,n+x+4) for (n,x) in zip(size(inside(a)),grid_loc()))
MPI.Allgather(xs, mpi_grid().comm)
end

end # module