Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
AFiD/decomp_2d.F90
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1222 lines (1020 sloc)
42.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!======================================================================= | |
! This is part of the 2DECOMP&FFT library | |
! | |
! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) | |
! decomposition. It also implements a highly scalable distributed | |
! three-dimensional Fast Fourier Transform (FFT). | |
! | |
! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG) | |
! | |
!======================================================================= | |
! This is the main 2D pencil decomposition module | |
module decomp_2d

  implicit none

  include "mpif.h"

#ifdef GLOBAL_ARRAYS
#include "mafdecls.fh"
#include "global.fh"
#endif

  private        ! Make everything private unless declared public

  ! Working precision (double) and the matching MPI datatypes used in
  ! every communication call of this module.
  integer, parameter, public :: mytype = KIND(0.0D0)
  integer, parameter, public :: real_type = MPI_DOUBLE_PRECISION
  integer, parameter, public :: complex_type = MPI_DOUBLE_COMPLEX
#ifdef GLOBAL_ARRAYS
  integer, parameter, public :: ga_real_type = MT_F_DBL
  integer, parameter, public :: ga_complex_type = MT_F_DCPL
#endif

  ! number of bytes of one real of kind 'mytype', queried from MPI at start-up
  integer, save, public :: mytype_bytes

  ! some key global variables
  integer, save, public :: nx_global, ny_global, nz_global ! global size

  integer, save, public :: nrank ! local MPI rank
  integer, save, public :: nproc ! total number of processors

  ! parameters for 2D Cartesian topology
  integer, save, dimension(2) :: dims, coord
  logical, save, dimension(2) :: periodic
  integer, save, public :: DECOMP_2D_COMM_CART_X, &
       DECOMP_2D_COMM_CART_Y, DECOMP_2D_COMM_CART_Z
  integer, save :: DECOMP_2D_COMM_ROW, DECOMP_2D_COMM_COL

  ! define neighboring blocks (to be used in halo-cell support)
  ! first dimension 1=X-pencil, 2=Y-pencil, 3=Z-pencil
  ! second dimension 1=east, 2=west, 3=north, 4=south, 5=top, 6=bottom
  ! NOTE(review): only ONE row is declared although the comment above
  ! lists three pencil orientations -- presumably only the X-pencil row
  ! is used in this code base; confirm against init_neighbour before
  ! relying on rows 2 or 3.
  integer, save, dimension(1,6) :: neighbour

  ! flags for periodic condition in three dimensions
  logical, save :: periodic_x, periodic_y, periodic_z

#ifdef SHM
  ! derived type to store shared-memory info
  TYPE, public :: SMP_INFO
     integer MPI_COMM ! SMP associated with this communicator
     integer NODE_ME ! rank in this communicator
     integer NCPU ! size of this communicator
     integer SMP_COMM ! communicator for SMP-node masters
     integer CORE_COMM ! communicator for cores on SMP-node
     integer SMP_ME ! SMP-node id starting from 1 ... NSMP
     integer NSMP ! number of SMP-nodes in this communicator
     integer CORE_ME ! core id starting from 1 ... NCORE
     integer NCORE ! number of cores on this SMP-node
     integer MAXCORE ! maximum no. cores on any SMP-node
     integer N_SND ! size of SMP shared memory buffer
     integer N_RCV ! size of SMP shared memory buffer
     integer(8) SND_P ! SNDBUF address (cray pointer), for real
     integer(8) RCV_P ! RCVBUF address (cray pointer), for real
     integer(8) SND_P_c ! for complex
     integer(8) RCV_P_c ! for complex
  END TYPE SMP_INFO
#endif

  ! derived type to store decomposition info for a given global data size
  TYPE, public :: DECOMP_INFO
     ! starting/ending index and size of data held by current processor
     integer, dimension(3) :: xst, xen, xsz ! x-pencil
     integer, dimension(3) :: yst, yen, ysz ! y-pencil
     integer, dimension(3) :: zst, zen, zsz ! z-pencil

     ! in addition to local information, processors also need to know
     ! some global information for global communications to work

     ! how each dimension is distributed along pencils
     integer, allocatable, dimension(:) :: &
          x1dist, y1dist, y2dist, z2dist, x2dist, z1dist, &
          x1st, y1st, y2st, z2st, x2st, z1st, &
          x1en, y1en, y2en, z2en, x2en, z1en

     ! send/receive buffer counts and displacements for MPI_ALLTOALLV
     integer, allocatable, dimension(:) :: &
          x1cnts, y1cnts, y2cnts, z2cnts, x2cnts, z1cnts
     integer, allocatable, dimension(:) :: &
          x1disp, y1disp, y2disp, z2disp, x2disp, z1disp

     ! buffer counts for MPI_ALLTOALL: either for evenly distributed data
     ! or for padded-alltoall
     integer :: x1count, y1count, y2count, z2count, x2count, z1count

     ! buffer counts, displacements and types for MPI_Alltoallw to transform
     ! directly between x- and z-pencils
     ! This is only for the complex datatype
     integer,dimension(:),allocatable::zcnts_xz,ztypes_xz
     integer,dimension(:),allocatable::xcnts_xz,xtypes_xz
#ifdef MPI3
     ! use MPI_ADDRESS_KIND for MPI_Neighbor_alltoallw call
     integer(kind=MPI_ADDRESS_KIND),dimension(:),allocatable::zdispls_xz,xdispls_xz
     integer :: xtozNeighborComm,ztoxNeighborComm
     integer,dimension(:),allocatable::xranks,zranks
#else
     ! use default integer for MPI_Alltoallw call
     integer,dimension(:),allocatable::zdispls_xz,xdispls_xz
#endif
!#endif

     ! evenly distributed data
     logical :: even

#ifdef SHM
     ! For shared-memory implementation
     ! one instance of this derived type for each communicator
     ! shared memory info, such as which MPI rank belongs to which node
     TYPE(SMP_INFO) :: ROW_INFO, COL_INFO

     ! shared send/recv buffers for ALLTOALLV
     integer, allocatable, dimension(:) :: x1cnts_s, y1cnts_s, &
          y2cnts_s, z2cnts_s
     integer, allocatable, dimension(:) :: x1disp_s, y1disp_s, &
          y2disp_s, z2disp_s
     ! A copy of original buffer displacement (will be overwritten)
     integer, allocatable, dimension(:) :: x1disp_o, y1disp_o, &
          y2disp_o, z2disp_o
#endif
  END TYPE DECOMP_INFO

  ! main (default) decomposition information for global size nx*ny*nz
  TYPE(DECOMP_INFO), save :: decomp_main
  TYPE(DECOMP_INFO), save :: decomp_ph,decomp_sp

  ! starting/ending index and size of data held by current processor
  ! duplicate 'decomp_main', needed by apps to define data structure
  integer, save, dimension(3), public :: xstart, xend, xsize ! x-pencil
  integer, save, dimension(3), public :: ystart, yend, ysize ! y-pencil
  integer, save, dimension(3), public :: zstart, zend, zsize ! z-pencil

  ! These are the buffers used by MPI_ALLTOALL(V) calls
  integer, save :: decomp_buf_size = 0
  real(mytype), allocatable, dimension(:) :: work1_r, work2_r
  complex(mytype), allocatable, dimension(:) :: work1_c, work2_c

  ! public user routines
  public :: decomp_2d_init, decomp_2d_finalize, &
       transpose_x_to_y, transpose_y_to_z, &
       transpose_z_to_y, transpose_y_to_x, &
       transpose_z_to_x, transpose_x_to_z, &
       decomp_info_init, decomp_info_finalize, partition, &
#ifdef GLOBAL_ARRAYS
       get_global_array, &
#endif
       alloc_x, alloc_y, alloc_z, &
       update_halo, decomp_2d_abort, &
       get_decomp_info

  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  ! These are routines to perform global data transpositions
  !
  ! Four combinations are available, enough to cover all situations
  !  - transpose_x_to_y (X-pencil --> Y-pencil)
  !  - transpose_y_to_z (Y-pencil --> Z-pencil)
  !  - transpose_z_to_y (Z-pencil --> Y-pencil)
  !  - transpose_y_to_x (Y-pencil --> X-pencil)
  !
  ! Generic interface provided here to support multiple data types
  !  - real and complex types supported through generic interface
  !  - single/double precision supported through pre-processing
  !    * see 'mytype' variable at the beginning
  !  - an optional argument can be supplied to transpose data whose
  !    global size is not the default nx*ny*nz
  !    * as the case in fft r2c/c2r interface
  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  interface transpose_x_to_y
     module procedure transpose_x_to_y_real
     module procedure transpose_x_to_y_complex
  end interface transpose_x_to_y

  interface transpose_y_to_z
     module procedure transpose_y_to_z_real
     module procedure transpose_y_to_z_complex
  end interface transpose_y_to_z

  interface transpose_z_to_y
     module procedure transpose_z_to_y_real
     module procedure transpose_z_to_y_complex
  end interface transpose_z_to_y

  interface transpose_y_to_x
     module procedure transpose_y_to_x_real
     module procedure transpose_y_to_x_complex
  end interface transpose_y_to_x

  interface transpose_z_to_x
     ! not available
     ! module procedure transpose_z_to_x_real
     module procedure transpose_z_to_x_complex
  end interface transpose_z_to_x

  interface transpose_x_to_z
     ! not available
     ! module procedure transpose_x_to_z_real
     module procedure transpose_x_to_z_complex
  end interface transpose_x_to_z

  interface update_halo
     module procedure update_halo_real
  end interface update_halo

  interface alloc_x
     module procedure alloc_x_real
     module procedure alloc_x_complex
  end interface alloc_x

  interface alloc_y
     module procedure alloc_y_real
     module procedure alloc_y_complex
  end interface alloc_y

  interface alloc_z
     module procedure alloc_z_real
     module procedure alloc_z_complex
  end interface alloc_z
contains | |
#ifdef SHM_DEBUG | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! For debugging, print the shared-memory structure | |
  subroutine print_smp_info(s)
    ! Dump all fields of a shared-memory (SMP) descriptor to unit 10
    ! (the per-rank log file opened by decomp_2d_init under SHM_DEBUG).
    TYPE(SMP_INFO) :: s
    write(10,*) 'size of current communicator:', s%NCPU
    write(10,*) 'rank in current communicator:', s%NODE_ME
    write(10,*) 'number of SMP-nodes in this communicator:', s%NSMP
    write(10,*) 'SMP-node id (1 ~ NSMP):', s%SMP_ME
    write(10,*) 'NCORE - number of cores on this SMP-node', s%NCORE
    write(10,*) 'core id (1 ~ NCORE):', s%CORE_ME
    write(10,*) 'maximum no. cores on any SMP-node:', s%MAXCORE
    write(10,*) 'size of SMP shared memory SND buffer:', s%N_SND
    write(10,*) 'size of SMP shared memory RCV buffer:', s%N_RCV
  end subroutine print_smp_info
#endif | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Routine to be called by applications to initialise this library | |
! INPUT: | |
! nx, ny, nz - global data dimension | |
! p_row, p_col - 2D processor grid | |
! OUTPUT: | |
! all internal data structures initialised properly | |
! library ready to use | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
subroutine decomp_2d_init(nx,ny,nz,p_row,p_col,periodic_bc) | |
implicit none | |
integer, intent(IN) :: nx,ny,nz,p_row,p_col | |
logical, dimension(3), intent(IN), optional :: periodic_bc | |
integer :: errorcode, ierror, row, col | |
#ifdef SHM_DEBUG | |
character(len=80) fname | |
#endif | |
nx_global = nx | |
ny_global = ny | |
nz_global = nz | |
if (present(periodic_bc)) then | |
periodic_x = periodic_bc(1) | |
periodic_y = periodic_bc(2) | |
periodic_z = periodic_bc(3) | |
else | |
periodic_x = .false. | |
periodic_y = .false. | |
periodic_z = .false. | |
end if | |
call MPI_INIT(ierror) | |
call MPI_COMM_RANK(MPI_COMM_WORLD,nrank,ierror) | |
call MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ierror) | |
if (p_row==0 .and. p_col==0) then | |
! determine the best 2D processor grid | |
call best_2d_grid(nproc, row, col) | |
else | |
if (nproc /= p_row*p_col) then | |
errorcode = 1 | |
call decomp_2d_abort(errorcode, & | |
'Invalid 2D processor grid - nproc /= p_row*p_col') | |
else | |
row = p_row | |
col = p_col | |
if (nrank==0) then | |
write(*,*) 'the used processor grid is ', row,' by ', col | |
end if | |
end if | |
end if | |
! Create 2D Catersian topology | |
! Note that in order to support periodic B.C. in the halo-cell code, | |
! need to create multiple topology objects: DECOMP_2D_COMM_CART_?, | |
! corresponding to three pencil orientations. They contain almost | |
! identical topological information but allow different combinations | |
! of periodic conditions. | |
dims(1) = row | |
dims(2) = col | |
periodic(1) = periodic_y | |
periodic(2) = periodic_z | |
call MPI_CART_CREATE(MPI_COMM_WORLD,2,dims,periodic, & | |
.false., & ! do not reorder rank | |
DECOMP_2D_COMM_CART_X, ierror) | |
periodic(1) = periodic_x | |
periodic(2) = periodic_z | |
call MPI_CART_CREATE(MPI_COMM_WORLD,2,dims,periodic, & | |
.false., DECOMP_2D_COMM_CART_Y, ierror) | |
periodic(1) = periodic_x | |
periodic(2) = periodic_y | |
call MPI_CART_CREATE(MPI_COMM_WORLD,2,dims,periodic, & | |
.false., DECOMP_2D_COMM_CART_Z, ierror) | |
call MPI_CART_COORDS(DECOMP_2D_COMM_CART_X,nrank,2,coord,ierror) | |
! derive communicators defining sub-groups for ALLTOALL(V) | |
call MPI_CART_SUB(DECOMP_2D_COMM_CART_X,(/.true.,.false./), & | |
DECOMP_2D_COMM_COL,ierror) | |
call MPI_CART_SUB(DECOMP_2D_COMM_CART_X,(/.false.,.true./), & | |
DECOMP_2D_COMM_ROW,ierror) | |
! gather information for halo-cell support code | |
call init_neighbour | |
! actually generate all 2D decomposition information | |
call decomp_info_init(nx,ny,nz,decomp_main) | |
! make a copy of the decomposition information associated with the | |
! default global size in these global variables so applications can | |
! use them to create data structures | |
xstart = decomp_main%xst | |
ystart = decomp_main%yst | |
zstart = decomp_main%zst | |
xend = decomp_main%xen | |
yend = decomp_main%yen | |
zend = decomp_main%zen | |
xsize = decomp_main%xsz | |
ysize = decomp_main%ysz | |
zsize = decomp_main%zsz | |
decomp_ph=decomp_main | |
#ifdef SHM_DEBUG | |
! print out shared-memory information | |
write(fname,99) nrank | |
99 format('log',I2.2) | |
open(10,file=fname) | |
write(10,*)'I am mpi rank ', nrank, 'Total ranks ', nproc | |
write(10,*)' ' | |
write(10,*)'Global data size:' | |
write(10,*)'nx*ny*nz', nx,ny,nz | |
write(10,*)' ' | |
write(10,*)'2D processor grid:' | |
write(10,*)'p_row*p_col:', dims(1), dims(2) | |
write(10,*)' ' | |
write(10,*)'Portion of global data held locally:' | |
write(10,*)'xsize:',xsize | |
write(10,*)'ysize:',ysize | |
write(10,*)'zsize:',zsize | |
write(10,*)' ' | |
write(10,*)'How pensils are to be divided and sent in alltoallv:' | |
write(10,*)'x1dist:',decomp_main%x1dist | |
write(10,*)'y1dist:',decomp_main%y1dist | |
write(10,*)'y2dist:',decomp_main%y2dist | |
write(10,*)'z2dist:',decomp_main%z2dist | |
write(10,*)' ' | |
write(10,*)'######Shared buffer set up after this point######' | |
write(10,*)' ' | |
write(10,*) 'col communicator detais:' | |
call print_smp_info(decomp_main%COL_INFO) | |
write(10,*)' ' | |
write(10,*) 'row communicator detais:' | |
call print_smp_info(decomp_main%ROW_INFO) | |
write(10,*)' ' | |
write(10,*)'Buffer count and dispalcement of per-core buffers' | |
write(10,*)'x1cnts:',decomp_main%x1cnts | |
write(10,*)'y1cnts:',decomp_main%y1cnts | |
write(10,*)'y2cnts:',decomp_main%y2cnts | |
write(10,*)'z2cnts:',decomp_main%z2cnts | |
write(10,*)'x1disp:',decomp_main%x1disp | |
write(10,*)'y1disp:',decomp_main%y1disp | |
write(10,*)'y2disp:',decomp_main%y2disp | |
write(10,*)'z2disp:',decomp_main%z2disp | |
write(10,*)' ' | |
write(10,*)'Buffer count and dispalcement of shared buffers' | |
write(10,*)'x1cnts:',decomp_main%x1cnts_s | |
write(10,*)'y1cnts:',decomp_main%y1cnts_s | |
write(10,*)'y2cnts:',decomp_main%y2cnts_s | |
write(10,*)'z2cnts:',decomp_main%z2cnts_s | |
write(10,*)'x1disp:',decomp_main%x1disp_s | |
write(10,*)'y1disp:',decomp_main%y1disp_s | |
write(10,*)'y2disp:',decomp_main%y2disp_s | |
write(10,*)'z2disp:',decomp_main%z2disp_s | |
write(10,*)' ' | |
close(10) | |
#endif | |
! determine the number of bytes per float number | |
! do not use 'mytype' which is compiler dependent | |
! also possible to use inquire(iolength=...) | |
call MPI_TYPE_SIZE(real_type,mytype_bytes,ierror) | |
#ifdef EVEN | |
if (nrank==0) write(*,*) 'Padded ALLTOALL optimisation on' | |
#endif | |
return | |
end subroutine decomp_2d_init | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Routine to be called by applications to clean things up | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
subroutine decomp_2d_finalize | |
implicit none | |
integer ierr | |
call decomp_info_finalize(decomp_main) | |
decomp_buf_size = 0 | |
deallocate(work1_r, work2_r, work1_c, work2_c) | |
call MPI_Finalize(ierr) | |
return | |
end subroutine decomp_2d_finalize | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Return the default decomposition object | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
  subroutine get_decomp_info(decomp)
    ! Hand the caller a copy of the default decomposition object, i.e.
    ! the one built by decomp_2d_init for the nx_global*ny_global*nz_global
    ! grid. Derived-type assignment copies all allocatable components.
    implicit none
    TYPE(DECOMP_INFO), intent(OUT) :: decomp
    decomp = decomp_main
    return
  end subroutine get_decomp_info
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Advanced Interface allowing applications to define global domain of
! any size, distribute it, and then transpose data among pencils. | |
! - generate 2D decomposition details as defined in DECOMP_INFO | |
! - the default global data size is nx*ny*nz | |
! - a different global size nx/2+1,ny,nz is used in FFT r2c/c2r | |
! - multiple global sizes can co-exist in one application, each | |
! using its own DECOMP_INFO object | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
subroutine decomp_info_init(nx,ny,nz,decomp) | |
implicit none | |
integer, intent(IN) :: nx,ny,nz | |
TYPE(DECOMP_INFO), intent(INOUT) :: decomp | |
integer :: buf_size, status, errorcode | |
! verify the global size can actually be distributed as pencils | |
if (nx<dims(1) .or. ny<dims(1) .or. ny<dims(2) .or. nz<dims(2)) then | |
errorcode = 6 | |
call decomp_2d_abort(errorcode, & | |
'Invalid 2D processor grid. ' // & | |
'Make sure that min(nx,ny) >= p_row and ' // & | |
'min(ny,nz) >= p_col') | |
end if | |
if (mod(nx,dims(1))==0 .and. mod(ny,dims(1))==0 .and. & | |
mod(ny,dims(2))==0 .and. mod(nz,dims(2))==0) then | |
decomp%even = .true. | |
else | |
decomp%even = .false. | |
end if | |
! distribute mesh points | |
allocate(decomp%x1dist(0:dims(1)-1),decomp%y1dist(0:dims(1)-1), & | |
decomp%y2dist(0:dims(2)-1),decomp%z2dist(0:dims(2)-1), & | |
decomp%x2dist(0:dims(2)-1),decomp%z1dist(0:dims(1)-1)) | |
allocate(decomp%x1st(0:dims(1)-1),decomp%x1en(0:dims(1)-1), & | |
decomp%y1st(0:dims(1)-1),decomp%y1en(0:dims(1)-1), & | |
decomp%z1st(0:dims(1)-1),decomp%z1en(0:dims(1)-1)) | |
allocate(decomp%x2st(0:dims(2)-1),decomp%x2en(0:dims(2)-1), & | |
decomp%y2st(0:dims(2)-1),decomp%y2en(0:dims(2)-1), & | |
decomp%z2st(0:dims(2)-1),decomp%z2en(0:dims(2)-1)) | |
call get_dist(nx,ny,nz,decomp) | |
! generate partition information - starting/ending index etc. | |
call partition(nx, ny, nz, (/ 1,2,3 /), & | |
decomp%xst, decomp%xen, decomp%xsz) | |
call partition(nx, ny, nz, (/ 2,1,3 /), & | |
decomp%yst, decomp%yen, decomp%ysz) | |
call partition(nx, ny, nz, (/ 2,3,1 /), & | |
decomp%zst, decomp%zen, decomp%zsz) | |
! prepare send/receive buffer displacement and count for ALLTOALL(V) | |
allocate(decomp%x1cnts(0:dims(1)-1),decomp%y1cnts(0:dims(1)-1), & | |
decomp%y2cnts(0:dims(2)-1),decomp%z2cnts(0:dims(2)-1), & | |
decomp%z1cnts(0:dims(1)-1),decomp%x2cnts(0:dims(2)-1)) | |
allocate(decomp%x1disp(0:dims(1)-1),decomp%y1disp(0:dims(1)-1), & | |
decomp%y2disp(0:dims(2)-1),decomp%z2disp(0:dims(2)-1), & | |
decomp%x2disp(0:dims(2)-1),decomp%z1disp(0:dims(1)-1)) | |
! allocate arrays for MPI_ALLTOALLW calls | |
allocate(decomp%xcnts_xz(nproc),decomp%zcnts_xz(nproc)) | |
allocate(decomp%xtypes_xz(nproc),decomp%ztypes_xz(nproc)) | |
allocate(decomp%xdispls_xz(nproc)) | |
allocate(decomp%zdispls_xz(nproc)) | |
call prepare_buffer(decomp) | |
! allocate memory for the MPI_ALLTOALL(V) buffers | |
! define the buffers globally for performance reason | |
buf_size = max(decomp%xsz(1)*decomp%xsz(2)*decomp%xsz(3), & | |
max(decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3), & | |
decomp%zsz(1)*decomp%zsz(2)*decomp%zsz(3)) ) | |
! check if additional memory is required | |
! *** TODO: consider how to share the real/complex buffers | |
if (buf_size > decomp_buf_size) then | |
decomp_buf_size = buf_size | |
if (allocated(work1_r)) deallocate(work1_r) | |
if (allocated(work2_r)) deallocate(work2_r) | |
if (allocated(work1_c)) deallocate(work1_c) | |
if (allocated(work2_c)) deallocate(work2_c) | |
allocate(work1_r(buf_size), STAT=status) | |
allocate(work2_r(buf_size), STAT=status) | |
allocate(work1_c(buf_size), STAT=status) | |
allocate(work2_c(buf_size), STAT=status) | |
if (status /= 0) then | |
errorcode = 2 | |
call decomp_2d_abort(errorcode, & | |
'Out of memory when allocating 2DECOMP workspace') | |
end if | |
end if | |
return | |
end subroutine decomp_info_init | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Release memory associated with a DECOMP_INFO object | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
subroutine decomp_info_finalize(decomp) | |
implicit none | |
integer :: i | |
integer :: ierror | |
TYPE(DECOMP_INFO), intent(INOUT) :: decomp | |
deallocate(decomp%x1dist,decomp%y1dist,decomp%y2dist,decomp%z2dist) | |
deallocate(decomp%z1dist,decomp%x2dist) | |
deallocate(decomp%x1st,decomp%y1st,decomp%y2st,decomp%z2st) | |
deallocate(decomp%z1st,decomp%x2st) | |
deallocate(decomp%x1en,decomp%y1en,decomp%y2en,decomp%z2en) | |
deallocate(decomp%z1en,decomp%x2en) | |
deallocate(decomp%x1cnts,decomp%y1cnts,decomp%y2cnts,decomp%z2cnts) | |
deallocate(decomp%z1cnts,decomp%x2cnts) | |
deallocate(decomp%x1disp,decomp%y1disp,decomp%y2disp,decomp%z2disp) | |
deallocate(decomp%z1disp,decomp%x2disp) | |
do i=1,nproc | |
if (decomp%ztypes_xz(i).ne.MPI_INTEGER) then | |
call MPI_Type_free(decomp%ztypes_xz(i),ierror) | |
endif | |
if (decomp%xtypes_xz(i).ne.MPI_INTEGER) then | |
call MPI_Type_free(decomp%xtypes_xz(i),ierror) | |
endif | |
enddo | |
deallocate(decomp%xcnts_xz,decomp%zcnts_xz) | |
deallocate(decomp%xtypes_xz,decomp%ztypes_xz) | |
deallocate(decomp%xdispls_xz) | |
deallocate(decomp%zdispls_xz) | |
#ifdef MPI3 | |
deallocate(decomp%xranks,decomp%zranks) | |
#endif | |
return | |
end subroutine decomp_info_finalize | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Find sub-domain information held by current processor | |
! INPUT: | |
! nx, ny, nz - global data dimension | |
! pdim(3) - number of processor grid in each dimension, | |
!             valid values: 1 - distribute locally;
! 2 - distribute across p_row; | |
! 3 - distribute across p_col | |
! OUTPUT: | |
! lstart(3) - starting index | |
! lend(3) - ending index | |
! lsize(3) - size of the sub-block (redundant) | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
subroutine partition(nx, ny, nz, pdim, lstart, lend, lsize) | |
implicit none | |
integer, intent(IN) :: nx, ny, nz | |
integer, dimension(3), intent(IN) :: pdim | |
integer, dimension(3), intent(OUT) :: lstart, lend, lsize | |
integer, allocatable, dimension(:) :: st,en,sz | |
integer :: i, gsize | |
do i = 1, 3 | |
if (i==1) then | |
gsize = nx | |
else if (i==2) then | |
gsize = ny | |
else if (i==3) then | |
gsize = nz | |
end if | |
if (pdim(i) == 1) then ! all local | |
lstart(i) = 1 | |
lend(i) = gsize | |
lsize(i) = gsize | |
elseif (pdim(i) == 2) then ! distribute across dims(1) | |
allocate(st(0:dims(1)-1)) | |
allocate(en(0:dims(1)-1)) | |
allocate(sz(0:dims(1)-1)) | |
call distribute(gsize,dims(1),st,en,sz) | |
lstart(i) = st(coord(1)) | |
lend(i) = en(coord(1)) | |
lsize(i) = sz(coord(1)) | |
deallocate(st,en,sz) | |
elseif (pdim(i) == 3) then ! distribute across dims(2) | |
allocate(st(0:dims(2)-1)) | |
allocate(en(0:dims(2)-1)) | |
allocate(sz(0:dims(2)-1)) | |
call distribute(gsize,dims(2),st,en,sz) | |
lstart(i) = st(coord(2)) | |
lend(i) = en(coord(2)) | |
lsize(i) = sz(coord(2)) | |
deallocate(st,en,sz) | |
end if | |
end do | |
return | |
end subroutine partition | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
!   - distributes grid points in one dimension
! - handles uneven distribution properly | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
subroutine distribute(data1,proc,st,en,sz) | |
implicit none | |
! data1 -- data size in any dimension to be partitioned | |
! proc -- number of processors in that dimension | |
! st -- array of starting index | |
! en -- array of ending index | |
! sz -- array of local size (redundent) | |
integer data1,proc,st(0:proc-1),en(0:proc-1),sz(0:proc-1) | |
integer i,size1,nl,nu | |
size1=data1/proc | |
nu = data1 - size1 * proc | |
nl = proc - nu | |
st(0) = 1 | |
sz(0) = size1 | |
en(0) = size1 | |
do i=1,nl-1 | |
st(i) = st(i-1) + size1 | |
sz(i) = size1 | |
en(i) = en(i-1) + size1 | |
end do | |
size1 = size1 + 1 | |
do i=nl,proc-1 | |
st(i) = en(i-1) + 1 | |
sz(i) = size1 | |
en(i) = en(i-1) + size1 | |
end do | |
en(proc-1)= data1 | |
sz(proc-1)= data1-st(proc-1)+1 | |
return | |
end subroutine distribute | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Define how each dimension is distributed across processors | |
! e.g. 17 meshes across 4 processors would be distributed as (4,4,4,5)
! such global information is required locally at MPI_ALLTOALLV time | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
  subroutine get_dist(nx,ny,nz,decomp)
    ! Fill the global distribution tables of 'decomp':
    !   x1* - how nx is split across dims(1) (X-pencil direction)
    !   y1* - how ny is split across dims(1) (X <-> Y transposes)
    !   y2* - how ny is split across dims(2) (Y <-> Z transposes)
    !   z2* - how nz is split across dims(2)
    ! NOTE(review): the z1*/x2* tables allocated in decomp_info_init are
    ! NOT filled here, yet prepare_buffer reads z1dist/x2dist when
    ! computing z1cnts/x2cnts -- presumably those counts are unused in
    ! this code base; verify before relying on them.
    integer, intent(IN) :: nx, ny, nz
    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
    call distribute(nx,dims(1),decomp%x1st,decomp%x1en,decomp%x1dist)
    call distribute(ny,dims(1),decomp%y1st,decomp%y1en,decomp%y1dist)
    call distribute(ny,dims(2),decomp%y2st,decomp%y2en,decomp%y2dist)
    call distribute(nz,dims(2),decomp%z2st,decomp%z2en,decomp%z2dist)
    return
  end subroutine get_dist
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Prepare the send / receive buffers for MPI_ALLTOALLV communications | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
  subroutine prepare_buffer(decomp)
    ! Compute, for the given decomposition, the counts/displacements used
    ! by the ALLTOALL(V) transposes, and build the MPI subarray datatypes
    ! (plus, under MPI3, the neighborhood communicators) used by the
    ! direct complex X <-> Z transposes via MPI_Alltoallw.

    implicit none

    TYPE(DECOMP_INFO), intent(INOUT) :: decomp

    integer :: i, k
    integer :: rank_x, rank_z
    integer :: subsize_y, offset_y
    integer :: ierror
#ifdef MPI3
    integer,dimension(nproc) :: xranks,zranks
    integer,dimension(nproc) :: xweights,zweights
    integer :: index_src, index_dest
#endif

    ! MPI_ALLTOALLV buffer information
    ! counts per peer rank; displacements are the running sums (0-based).
    ! NOTE(review): z1cnts/x2cnts read decomp%z1dist/x2dist, which are
    ! allocated but not filled by get_dist in this file -- confirm these
    ! counts are either unused or initialised elsewhere.
    do i=0, dims(1)-1
       decomp%x1cnts(i) = decomp%x1dist(i)*decomp%xsz(2)*decomp%xsz(3)
       decomp%y1cnts(i) = decomp%ysz(1)*decomp%y1dist(i)*decomp%ysz(3)
       decomp%z1cnts(i) = decomp%zsz(1)*decomp%zsz(2)*decomp%z1dist(i)
       if (i==0) then
          decomp%x1disp(i) = 0 ! displacement is 0-based index
          decomp%y1disp(i) = 0
          decomp%z1disp(i) = 0
       else
          decomp%x1disp(i) = decomp%x1disp(i-1) + decomp%x1cnts(i-1)
          decomp%y1disp(i) = decomp%y1disp(i-1) + decomp%y1cnts(i-1)
          decomp%z1disp(i) = decomp%z1disp(i-1) + decomp%z1cnts(i-1)
       end if
    end do

    do i=0, dims(2)-1
       decomp%x2cnts(i) = decomp%x2dist(i)*decomp%xsz(2)*decomp%xsz(3)
       decomp%y2cnts(i) = decomp%ysz(1)*decomp%y2dist(i)*decomp%ysz(3)
       decomp%z2cnts(i) = decomp%zsz(1)*decomp%zsz(2)*decomp%z2dist(i)
       if (i==0) then
          decomp%x2disp(i) = 0 ! displacement is 0-based index
          decomp%y2disp(i) = 0 ! displacement is 0-based index
          decomp%z2disp(i) = 0
       else
          decomp%x2disp(i) = decomp%x2disp(i-1) + decomp%x2cnts(i-1)
          decomp%y2disp(i) = decomp%y2disp(i-1) + decomp%y2cnts(i-1)
          decomp%z2disp(i) = decomp%z2disp(i-1) + decomp%z2cnts(i-1)
       end if
    end do

    ! MPI_ALLTOALL buffer information
    ! For evenly distributed data, following is an easier implementation.
    ! But it should be covered by the more general formulation below.
    !decomp%x1count = decomp%xsz(1)*decomp%xsz(2)*decomp%xsz(3)/dims(1)
    !decomp%y1count = decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3)/dims(1)
    !decomp%y2count = decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3)/dims(2)
    !decomp%z2count = decomp%zsz(1)*decomp%zsz(2)*decomp%zsz(3)/dims(2)

    ! For unevenly distributed data, pad smaller messages. Note the
    ! last blocks along pencils always get assigned more mesh points

    ! for X <=> Y transposes
    decomp%x1count = decomp%x1dist(dims(1)-1) * &
         decomp%y1dist(dims(1)-1) * decomp%xsz(3)
    decomp%y1count = decomp%x1count

    ! for Y <=> Z transposes
    decomp%y2count = decomp%y2dist(dims(2)-1) * &
         decomp%z2dist(dims(2)-1) * decomp%zsz(1)
    decomp%z2count = decomp%y2count

    ! Information for MPI_Alltoallw for complex X <=> Z transposes
    ! MPI_INTEGER acts as a "no type created" placeholder so that
    ! decomp_info_finalize knows which entries to free.
    decomp%xdispls_xz(:)=0
    decomp%zdispls_xz(:)=0
    decomp%xcnts_xz(:)=0
    decomp%zcnts_xz(:)=0
    decomp%xtypes_xz(:)=MPI_INTEGER
    decomp%ztypes_xz(:)=MPI_INTEGER
#ifdef MPI3
    index_src=0
    index_dest=0
#endif
    do k=0,dims(1)-1
       do i=0,dims(2)-1
          ! Actually, rank_x and rank_z are the same..
          call MPI_Cart_rank(DECOMP_2D_COMM_CART_X,(/k,i/),rank_x,ierror)
          ! this call fails, since DECOMP_2D_COMM_CART_Z is not yet created
          ! call MPI_Cart_rank(DECOMP_2D_COMM_CART_Z,(/k,i/),rank_z,ierror)
          rank_z=rank_x
          !JD no checks on x- or z-dimension, since we transform from z- into x-pencils, so these always overlap.
          if (decomp%zst(2).le.decomp%y1en(k) .and. &
               decomp%zen(2).ge.decomp%y1st(k)) then
             decomp%zcnts_xz(rank_z+1)=1
             subsize_y=min(decomp%zen(2),decomp%y1en(k))-max(decomp%zst(2),decomp%y1st(k))+1
             offset_y =max(decomp%zst(2),decomp%y1st(k))-decomp%zst(2)
#ifdef MPI3
             index_src=index_src+1
             zranks(index_src)=rank_z
             zweights(index_src)=decomp%zsz(1)*subsize_y*decomp%z2dist(i)
#endif
             call MPI_Type_create_subarray(3,decomp%zsz, &
                  (/decomp%zsz(1),subsize_y,decomp%z2dist(i)/), &
                  (/0,offset_y,decomp%z2st(i)-decomp%zst(3)/), &
                  MPI_ORDER_FORTRAN,complex_type,decomp%ztypes_xz(rank_z+1),ierror)
             call MPI_Type_commit(decomp%ztypes_xz(rank_z+1),ierror)
             !JD send to process with x-pencil defined by (k,i)
             !JD x-bounds are taken from the z-pencils
             !   send: decomp%zst(1):decomp%zen(1)
             !JD y-bounds are the overlapping region of both pencils.
             !   max(decomp%zst(2),decomp%y1st(k)):min(decomp%zen(2),decomp%y1en(k))
             !JD z-bounds are taken from the x-pencils.
             !   decomp%z2st(i):decomp%z2en(i)
          endif
          !JD no checks on x- or z-dimension, since we transform from z- into x-pencils, so these always overlap.
          if (decomp%xst(2).le.decomp%y2en(i) .and. &
               decomp%xen(2).ge.decomp%y2st(i)) then
             decomp%xcnts_xz(rank_x+1)=1
             subsize_y=min(decomp%xen(2),decomp%y2en(i))-max(decomp%xst(2),decomp%y2st(i))+1
             offset_y =max(decomp%xst(2),decomp%y2st(i))-decomp%xst(2)
#ifdef MPI3
             index_dest=index_dest+1
             xranks(index_dest)=rank_x
             xweights(index_dest)=decomp%x1dist(k)*subsize_y*decomp%xsz(3)
#endif
             call MPI_Type_create_subarray(3,decomp%xsz, &
                  (/decomp%x1dist(k),subsize_y,decomp%xsz(3)/), &
                  (/decomp%x1st(k)-decomp%xst(1),offset_y,0/), &
                  MPI_ORDER_FORTRAN,complex_type,decomp%xtypes_xz(rank_x+1),ierror)
             call MPI_Type_commit(decomp%xtypes_xz(rank_x+1),ierror)
             !JD recv from process with z-pencil defined by (k,i)
             !JD x-bounds are taken from the z-pencils
             !   send: decomp%x1st(k):decomp%x1en(k)
             !JD y-bounds are the overlapping region of both pencils.
             !   max(decomp%xst(2),decomp%y2st(i)):min(decomp%xen(2),decomp%y2en(i))
             !JD z-bounds are taken from the x-pencils.
             !   decomp%xst(3):decomp%xen(3)
          endif
       enddo
    enddo
#ifdef MPI3
    ! Build the two distributed-graph communicators (z->x and x->z) so
    ! the transposes can use MPI_Neighbor_alltoallw; ranks are stored
    ! 1-based in decomp%xranks/zranks for later indexing.
    allocate(decomp%xranks(index_dest))
    allocate(decomp%zranks(index_src))
    decomp%xranks=xranks(1:index_dest)+1
    decomp%zranks=zranks(1:index_src)+1
    call MPI_Dist_graph_create_adjacent(DECOMP_2D_COMM_CART_X, &
         index_src,zranks(1:index_src),zweights(1:index_src), &
         index_dest,xranks(1:index_dest),xweights(1:index_dest), &
         MPI_INFO_NULL,.true.,decomp%xtozNeighborComm,ierror)
    call MPI_Dist_graph_create_adjacent(DECOMP_2D_COMM_CART_X, &
         index_dest,xranks(1:index_dest),xweights(1:index_dest), &
         index_src,zranks(1:index_src),zweights(1:index_src), &
         MPI_INFO_NULL,.true.,decomp%ztoxNeighborComm,ierror)
#endif

    return
  end subroutine prepare_buffer
#ifdef GLOBAL_ARRAYS | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Create global arrays that map to the pencil decompositions
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
subroutine get_global_array(ga, ipencil, data_type, opt_decomp) | |
implicit none | |
integer, intent(OUT) :: ga | |
integer, intent(IN) :: ipencil ! 1=X-pencil; 2=Y-pencil; 3=Z-pencil | |
integer, intent(IN) :: data_type | |
TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp | |
TYPE(DECOMP_INFO) :: decomp | |
integer, dimension(3) :: nblock | |
integer, allocatable, dimension(:) :: map | |
integer :: offset, i, errorcode | |
logical :: success | |
if (present(opt_decomp)) then | |
decomp = opt_decomp | |
else | |
decomp = decomp_main | |
end if | |
ga = ga_create_handle() | |
call ga_set_data(ga, 3, & | |
(/decomp%xsz(1),decomp%ysz(2),decomp%zsz(3)/), data_type) | |
allocate(map(1+dims(1)+dims(2))) | |
! generate the GA irreg distribution parameters using | |
! 2DECOMP's decomposition information | |
if (ipencil==1) then ! X-pencil | |
nblock(1) = 1 | |
nblock(2) = dims(1) | |
nblock(3) = dims(2) | |
map(1) = 1 | |
offset = nblock(1)+1 | |
do i=0, dims(1)-1 | |
if (i==0) then | |
map(offset+i) = 1 | |
else | |
map(offset+i) = map(offset+i-1) + decomp%y1dist(i-1) | |
end if | |
end do | |
offset = nblock(1) + nblock(2) + 1 | |
do i=0, dims(2)-1 | |
if (i==0) then | |
map(offset+i) = 1 | |
else | |
map(offset+i) = map(offset+i-1) + decomp%z2dist(i-1) | |
end if | |
end do | |
else if (ipencil==2) then ! Y-pencil | |
nblock(1) = dims(1) | |
nblock(2) = 1 | |
nblock(3) = dims(2) | |
offset = 1 | |
do i=0, dims(1)-1 | |
if (i==0) then | |
map(offset+i) = 1 | |
else | |
map(offset+i) = map(offset+i-1) + decomp%x1dist(i-1) | |
end if | |
end do | |
map(nblock(1)+1) = 1 | |
offset = nblock(1) + nblock(2) + 1 | |
do i=0, dims(2)-1 | |
if (i==0) then | |
map(offset+i) = 1 | |
else | |
map(offset+i) = map(offset+i-1) + decomp%z2dist(i-1) | |
end if | |
end do | |
else if (ipencil==3) then ! Z-pencil | |
nblock(1) = dims(1) | |
nblock(2) = dims(2) | |
nblock(3) = 1 | |
offset = 1 | |
do i=0, dims(1)-1 | |
if (i==0) then | |
map(offset+i) = 1 | |
else | |
map(offset+i) = map(offset+i-1) + decomp%x1dist(i-1) | |
end if | |
end do | |
offset = nblock(1)+1 | |
do i=0, dims(2)-1 | |
if (i==0) then | |
map(offset+i) = 1 | |
else | |
map(offset+i) = map(offset+i-1) + decomp%y2dist(i-1) | |
end if | |
end do | |
map(nblock(1)+nblock(2)+1) = 1 | |
end if | |
call ga_set_irreg_distr(ga, map, nblock) | |
success = ga_allocate(ga) | |
if (.not.success) then | |
errorcode = 7 | |
call decomp_2d_abort(errorcode, & | |
'Failed to create global arrays') | |
end if | |
deallocate(map) | |
return | |
end subroutine get_global_array | |
#endif | |
#ifdef OCC | |
! For non-blocking communication code, progress the communication stack
subroutine transpose_test(handle) | |
implicit none | |
integer :: handle, ierror | |
call NBC_TEST(handle,ierror) | |
return | |
end subroutine transpose_test | |
#endif | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Transposition routines | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
#include "transpose_x_to_y.F90" | |
#include "transpose_y_to_z.F90" | |
#include "transpose_z_to_y.F90" | |
#include "transpose_y_to_x.F90" | |
#include "transpose_x_to_z.F90" | |
#include "transpose_z_to_x.F90" | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Auto-tuning algorithm to select the best 2D processor grid | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
subroutine best_2d_grid(iproc, best_p_row, best_p_col) | |
implicit none | |
integer, intent(IN) :: iproc | |
integer, intent(OUT) :: best_p_row, best_p_col | |
integer, allocatable, dimension(:) :: factors | |
double precision :: t1, t2, best_time | |
integer :: nfact, i, row, col, ierror, errorcode | |
real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3 | |
TYPE(DECOMP_INFO) :: decomp | |
if (nrank==0) write(*,*) 'In auto-tuning mode......' | |
best_time = huge(t1) | |
best_p_row = -1 | |
best_p_col = -1 | |
i = int(sqrt(real(iproc))) + 10 ! enough space to save all factors | |
allocate(factors(i)) | |
call findfactor(iproc, factors, nfact) | |
if (nrank==0) write(*,*) 'factors: ', (factors(i), i=1,nfact) | |
! Make initial communication to un-bias results | |
row = factors(1) | |
col = iproc / row | |
if (min(nx_global,ny_global)>=row .and. & | |
min(ny_global,nz_global)>=col) then | |
! 2D Catersian topology | |
dims(1) = row | |
dims(2) = col | |
periodic(1) = .false. | |
periodic(2) = .false. | |
call MPI_CART_CREATE(MPI_COMM_WORLD,2,dims,periodic, & | |
.false.,DECOMP_2D_COMM_CART_X, ierror) | |
call MPI_CART_COORDS(DECOMP_2D_COMM_CART_X,nrank,2,coord,ierror) | |
! communicators defining sub-groups for ALLTOALL(V) | |
call MPI_CART_SUB(DECOMP_2D_COMM_CART_X,(/.true.,.false./), & | |
DECOMP_2D_COMM_COL,ierror) | |
call MPI_CART_SUB(DECOMP_2D_COMM_CART_X,(/.false.,.true./), & | |
DECOMP_2D_COMM_ROW,ierror) | |
! generate 2D decomposition information for this row*col | |
call decomp_info_init(nx_global,ny_global,nz_global,decomp) | |
! arrays for X,Y and Z-pencils | |
allocate(u1(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3))) | |
allocate(u2(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3))) | |
! timing the transposition routines | |
t1 = MPI_WTIME() | |
call transpose_x_to_y(u1,u2,decomp) | |
call transpose_y_to_x(u2,u1,decomp) | |
t2 = MPI_WTIME() - t1 | |
deallocate(u1,u2) | |
call decomp_info_finalize(decomp) | |
call MPI_Comm_free(DECOMP_2D_COMM_COL,ierror) | |
call MPI_Comm_free(DECOMP_2D_COMM_ROW,ierror) | |
call MPI_Comm_free(DECOMP_2D_COMM_CART_X, ierror) | |
end if | |
do i=1, nfact | |
row = factors(i) | |
col = iproc / row | |
! enforce the limitation of 2D decomposition | |
if (min(nx_global,ny_global)>=row .and. & | |
min(ny_global,nz_global)>=col) then | |
! 2D Catersian topology | |
dims(1) = row | |
dims(2) = col | |
periodic(1) = .false. | |
periodic(2) = .false. | |
call MPI_CART_CREATE(MPI_COMM_WORLD,2,dims,periodic, & | |
.false.,DECOMP_2D_COMM_CART_X, ierror) | |
call MPI_CART_COORDS(DECOMP_2D_COMM_CART_X,nrank,2,coord,ierror) | |
! communicators defining sub-groups for ALLTOALL(V) | |
call MPI_CART_SUB(DECOMP_2D_COMM_CART_X,(/.true.,.false./), & | |
DECOMP_2D_COMM_COL,ierror) | |
call MPI_CART_SUB(DECOMP_2D_COMM_CART_X,(/.false.,.true./), & | |
DECOMP_2D_COMM_ROW,ierror) | |
! generate 2D decomposition information for this row*col | |
call decomp_info_init(nx_global,ny_global,nz_global,decomp) | |
! arrays for X,Y and Z-pencils | |
allocate(u1(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3))) | |
allocate(u2(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3))) | |
allocate(u3(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3))) | |
! timing the transposition routines | |
t1 = MPI_WTIME() | |
call transpose_x_to_y(u1,u2,decomp) | |
call transpose_y_to_z(u2,u3,decomp) | |
call transpose_z_to_y(u3,u2,decomp) | |
call transpose_y_to_x(u2,u1,decomp) | |
t2 = MPI_WTIME() - t1 | |
deallocate(u1,u2,u3) | |
call decomp_info_finalize(decomp) | |
call MPI_Comm_free(DECOMP_2D_COMM_COL,ierror) | |
call MPI_Comm_free(DECOMP_2D_COMM_ROW,ierror) | |
call MPI_Comm_free(DECOMP_2D_COMM_CART_X, ierror) | |
call MPI_ALLREDUCE(t2,t1,1,MPI_DOUBLE_PRECISION,MPI_SUM, & | |
MPI_COMM_WORLD,ierror) | |
t1 = t1 / dble(nproc) | |
if (nrank==0) then | |
write(*,*) 'processor grid', row, ' by ', col, ' time=', t1 | |
end if | |
if (best_time > t1) then | |
best_time = t1 | |
best_p_row = row | |
best_p_col = col | |
end if | |
end if | |
end do ! loop through processer grid | |
deallocate(factors) | |
if (best_p_row/=-1) then | |
if (nrank==0) then | |
write(*,*) 'the best processor grid is probably ', & | |
best_p_row, ' by ', best_p_col | |
end if | |
else | |
errorcode = 9 | |
call decomp_2d_abort(errorcode, & | |
'The processor-grid auto-tuning code failed. ' // & | |
'The number of processes requested is probably too large.') | |
end if | |
return | |
end subroutine best_2d_grid | |
#include "factor.F90" | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Halo cell support | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
#include "halo.F90" | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Error handling | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
subroutine decomp_2d_abort(errorcode, msg) | |
implicit none | |
integer, intent(IN) :: errorcode | |
character(len=*), intent(IN) :: msg | |
integer :: ierror | |
if (nrank==0) then | |
write(*,*) '2DECOMP&FFT ERROR - errorcode: ', errorcode | |
write(*,*) 'ERROR MESSAGE: ' // msg | |
end if | |
call MPI_ABORT(MPI_COMM_WORLD,errorcode,ierror) | |
return | |
end subroutine decomp_2d_abort | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
! Utility routines to help allocate 3D arrays | |
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
#include "alloc.F90" | |
end module decomp_2d | |