From f1f0c1641511bef78af9053801186b1326ba4d4b Mon Sep 17 00:00:00 2001 From: Youjia Li Date: Thu, 8 Aug 2024 21:49:17 -0500 Subject: [PATCH 01/20] add github workflow that checks sdist installation --- .../workflows/pnetcdf_c_official_sdist.yml | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 .github/workflows/pnetcdf_c_official_sdist.yml diff --git a/.github/workflows/pnetcdf_c_official_sdist.yml b/.github/workflows/pnetcdf_c_official_sdist.yml new file mode 100644 index 0000000..4b6a72d --- /dev/null +++ b/.github/workflows/pnetcdf_c_official_sdist.yml @@ -0,0 +1,104 @@ +name: Test PnetCDF-C latest official release with source distribution +on: + push: + branches: + - main + paths-ignore: + - '**/*.md' + - '**/*.txt' + pull_request: + branches: + - main + paths-ignore: + - '**/*.md' + - '**/*.txt' + +jobs: + build-linux: + name: Python (${{ matrix.python-version }}) + runs-on: ubuntu-latest + timeout-minutes: 60 + env: + MPICH_VERSION: 4.2.0 + MPICH_DIR: ${{ github.workspace }}/mpich-install + PNETCDF_VERSION: 1.13.0 + PNETCDF_DIR: ${{ github.workspace }}/PnetCDF-install + strategy: + matrix: + python-version: ["3.10"] + steps: + + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Ubuntu Dependencies + run: | + sudo apt-get update + sudo apt-get install m4 + + - name: Build MPICH + run: | + echo "Install MPICH ${MPICH_VERSION} in $MPICH_DIR" + rm -rf MPICH ; mkdir MPICH ; cd MPICH + wget -q https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz + gzip -dc mpich-${MPICH_VERSION}.tar.gz | tar -xf - + cd mpich-${MPICH_VERSION} + ./configure --prefix=$MPICH_DIR \ + --silent \ + --enable-romio \ + --with-file-system=ufs \ + --with-device=ch3:sock \ + --enable-shared\ + --disable-fortran \ + CC=gcc + make -s LIBTOOLFLAGS=--silent V=1 -j 4 install > qout 2>&1 + make -s -j 4 
distclean >> qout 2>&1 + + - name: Build PnetCDF-C official release + run: | + echo "Download and build PnetCDF-C release version ${PNETCDF_VERSION}" + wget -q https://parallel-netcdf.github.io/Release/pnetcdf-${PNETCDF_VERSION}.tar.gz + tar -xzf pnetcdf-${PNETCDF_VERSION}.tar.gz + pushd pnetcdf-${PNETCDF_VERSION} + ./configure --prefix=$PNETCDF_DIR \ + --silent \ + --enable-shared \ + --enable-debug \ + --disable-fortran \ + --disable-cxx \ + --with-mpi=$MPICH_DIR + make -s LIBTOOLFLAGS=--silent V=1 -j 4 install > qout 2>&1 + make -s -j 4 distclean >> qout 2>&1 + popd + + - name: Install python dependencies via pip + run: | + python -m pip install --upgrade pip + pip install numpy cython cftime pytest twine wheel check-manifest + export MPICC=$MPICH_DIR/bin/mpicc + pip install mpi4py + pip install torch + + - name: Install PnetCDF-Python from source distribution + run: | + export CC=$MPICH_DIR/bin/mpicc + python setup.py sdist + pip install dist/pnetcdf-*.tar.gz + + - name: Test PnetCDF-Python + run: | + export PATH=${MPICH_DIR}/bin:${PATH} + make ptests + +# - name: Tarball +# run: | +# export PATH=${NETCDF_DIR}/bin:${PATH} +# python setup.py --version +# check-manifest --version +# check-manifest --verbose +# pip wheel . 
-w dist --no-deps +# twine check dist/* From e9a88275d14c471759716c00c63597bbfb73de83 Mon Sep 17 00:00:00 2001 From: Youjia Li Date: Thu, 8 Aug 2024 22:49:32 -0500 Subject: [PATCH 02/20] add sdist insllation check to existing yml files --- .github/workflows/pnetcdf_c_master.yml | 12 ++ .github/workflows/pnetcdf_c_official.yml | 13 +++ .../workflows/pnetcdf_c_official_sdist.yml | 104 ------------------ 3 files changed, 25 insertions(+), 104 deletions(-) delete mode 100644 .github/workflows/pnetcdf_c_official_sdist.yml diff --git a/.github/workflows/pnetcdf_c_master.yml b/.github/workflows/pnetcdf_c_master.yml index 922c6f2..8e6f8c5 100644 --- a/.github/workflows/pnetcdf_c_master.yml +++ b/.github/workflows/pnetcdf_c_master.yml @@ -94,7 +94,19 @@ jobs: run: | export PATH=${MPICH_DIR}/bin:${PATH} make ptests + + - name: Re-install PnetCDF-Python from source distribution + run: | + pip uninstall pnetcdf + make install-clean + export CC=$MPICH_DIR/bin/mpicc + python setup.py sdist + pip install dist/pnetcdf-*.tar.gz + - name: Test PnetCDF-Python + run: | + export PATH=${MPICH_DIR}/bin:${PATH} + make ptests # - name: Tarball # run: | # export PATH=${NETCDF_DIR}/bin:${PATH} diff --git a/.github/workflows/pnetcdf_c_official.yml b/.github/workflows/pnetcdf_c_official.yml index 7682050..2228f32 100644 --- a/.github/workflows/pnetcdf_c_official.yml +++ b/.github/workflows/pnetcdf_c_official.yml @@ -93,6 +93,19 @@ jobs: export PATH=${MPICH_DIR}/bin:${PATH} make ptests + - name: Re-install PnetCDF-Python from source distribution + run: | + pip uninstall pnetcdf + make install-clean + export CC=$MPICH_DIR/bin/mpicc + python setup.py sdist + pip install dist/pnetcdf-*.tar.gz + + - name: Test PnetCDF-Python + run: | + export PATH=${MPICH_DIR}/bin:${PATH} + make ptests + # - name: Tarball # run: | # export PATH=${NETCDF_DIR}/bin:${PATH} diff --git a/.github/workflows/pnetcdf_c_official_sdist.yml b/.github/workflows/pnetcdf_c_official_sdist.yml deleted file mode 100644 index 
4b6a72d..0000000 --- a/.github/workflows/pnetcdf_c_official_sdist.yml +++ /dev/null @@ -1,104 +0,0 @@ -name: Test PnetCDF-C latest official release with source distribution -on: - push: - branches: - - main - paths-ignore: - - '**/*.md' - - '**/*.txt' - pull_request: - branches: - - main - paths-ignore: - - '**/*.md' - - '**/*.txt' - -jobs: - build-linux: - name: Python (${{ matrix.python-version }}) - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - MPICH_VERSION: 4.2.0 - MPICH_DIR: ${{ github.workspace }}/mpich-install - PNETCDF_VERSION: 1.13.0 - PNETCDF_DIR: ${{ github.workspace }}/PnetCDF-install - strategy: - matrix: - python-version: ["3.10"] - steps: - - - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install Ubuntu Dependencies - run: | - sudo apt-get update - sudo apt-get install m4 - - - name: Build MPICH - run: | - echo "Install MPICH ${MPICH_VERSION} in $MPICH_DIR" - rm -rf MPICH ; mkdir MPICH ; cd MPICH - wget -q https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz - gzip -dc mpich-${MPICH_VERSION}.tar.gz | tar -xf - - cd mpich-${MPICH_VERSION} - ./configure --prefix=$MPICH_DIR \ - --silent \ - --enable-romio \ - --with-file-system=ufs \ - --with-device=ch3:sock \ - --enable-shared\ - --disable-fortran \ - CC=gcc - make -s LIBTOOLFLAGS=--silent V=1 -j 4 install > qout 2>&1 - make -s -j 4 distclean >> qout 2>&1 - - - name: Build PnetCDF-C official release - run: | - echo "Download and build PnetCDF-C release version ${PNETCDF_VERSION}" - wget -q https://parallel-netcdf.github.io/Release/pnetcdf-${PNETCDF_VERSION}.tar.gz - tar -xzf pnetcdf-${PNETCDF_VERSION}.tar.gz - pushd pnetcdf-${PNETCDF_VERSION} - ./configure --prefix=$PNETCDF_DIR \ - --silent \ - --enable-shared \ - --enable-debug \ - --disable-fortran \ - --disable-cxx \ - --with-mpi=$MPICH_DIR - make -s LIBTOOLFLAGS=--silent V=1 
-j 4 install > qout 2>&1 - make -s -j 4 distclean >> qout 2>&1 - popd - - - name: Install python dependencies via pip - run: | - python -m pip install --upgrade pip - pip install numpy cython cftime pytest twine wheel check-manifest - export MPICC=$MPICH_DIR/bin/mpicc - pip install mpi4py - pip install torch - - - name: Install PnetCDF-Python from source distribution - run: | - export CC=$MPICH_DIR/bin/mpicc - python setup.py sdist - pip install dist/pnetcdf-*.tar.gz - - - name: Test PnetCDF-Python - run: | - export PATH=${MPICH_DIR}/bin:${PATH} - make ptests - -# - name: Tarball -# run: | -# export PATH=${NETCDF_DIR}/bin:${PATH} -# python setup.py --version -# check-manifest --version -# check-manifest --verbose -# pip wheel . -w dist --no-deps -# twine check dist/* From 0016fea46a1738b0c9c5b38fb6f665e5b0db993b Mon Sep 17 00:00:00 2001 From: Youjia Li Date: Thu, 8 Aug 2024 23:04:40 -0500 Subject: [PATCH 03/20] add -y to uninstall --- .github/workflows/pnetcdf_c_master.yml | 2 +- .github/workflows/pnetcdf_c_official.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pnetcdf_c_master.yml b/.github/workflows/pnetcdf_c_master.yml index 8e6f8c5..cbb54b2 100644 --- a/.github/workflows/pnetcdf_c_master.yml +++ b/.github/workflows/pnetcdf_c_master.yml @@ -97,7 +97,7 @@ jobs: - name: Re-install PnetCDF-Python from source distribution run: | - pip uninstall pnetcdf + pip uninstall -y pnetcdf make install-clean export CC=$MPICH_DIR/bin/mpicc python setup.py sdist diff --git a/.github/workflows/pnetcdf_c_official.yml b/.github/workflows/pnetcdf_c_official.yml index 2228f32..36cbaa0 100644 --- a/.github/workflows/pnetcdf_c_official.yml +++ b/.github/workflows/pnetcdf_c_official.yml @@ -95,7 +95,7 @@ jobs: - name: Re-install PnetCDF-Python from source distribution run: | - pip uninstall pnetcdf + pip uninstall -y pnetcdf make install-clean export CC=$MPICH_DIR/bin/mpicc python setup.py sdist From c899f22c4a9b0e40648daa973e13a4c1460d6af8 
Mon Sep 17 00:00:00 2001 From: wkliao Date: Fri, 9 Aug 2024 03:04:32 -0500 Subject: [PATCH 04/20] remove trailing whitespaces --- .github/workflows/pnetcdf_c_master.yml | 12 ++++++------ .github/workflows/pnetcdf_c_official.yml | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pnetcdf_c_master.yml b/.github/workflows/pnetcdf_c_master.yml index cbb54b2..a2e6b6e 100644 --- a/.github/workflows/pnetcdf_c_master.yml +++ b/.github/workflows/pnetcdf_c_master.yml @@ -94,7 +94,7 @@ jobs: run: | export PATH=${MPICH_DIR}/bin:${PATH} make ptests - + - name: Re-install PnetCDF-Python from source distribution run: | pip uninstall -y pnetcdf @@ -109,9 +109,9 @@ jobs: make ptests # - name: Tarball # run: | -# export PATH=${NETCDF_DIR}/bin:${PATH} -# python setup.py --version +# export PATH=${NETCDF_DIR}/bin:${PATH} +# python setup.py --version # check-manifest --version -# check-manifest --verbose -# pip wheel . -w dist --no-deps -# twine check dist/* +# check-manifest --verbose +# pip wheel . -w dist --no-deps +# twine check dist/* diff --git a/.github/workflows/pnetcdf_c_official.yml b/.github/workflows/pnetcdf_c_official.yml index 36cbaa0..8ff19b9 100644 --- a/.github/workflows/pnetcdf_c_official.yml +++ b/.github/workflows/pnetcdf_c_official.yml @@ -108,9 +108,9 @@ jobs: # - name: Tarball # run: | -# export PATH=${NETCDF_DIR}/bin:${PATH} -# python setup.py --version +# export PATH=${NETCDF_DIR}/bin:${PATH} +# python setup.py --version # check-manifest --version -# check-manifest --verbose -# pip wheel . -w dist --no-deps -# twine check dist/* +# check-manifest --verbose +# pip wheel . 
-w dist --no-deps +# twine check dist/* From 3041b0560d1c9184f4bde1e44920814bd17ff22e Mon Sep 17 00:00:00 2001 From: wkliao Date: Fri, 9 Aug 2024 03:06:22 -0500 Subject: [PATCH 05/20] enable verbose mode to show install warnings --- .github/workflows/pnetcdf_c_master.yml | 4 ++-- .github/workflows/pnetcdf_c_official.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pnetcdf_c_master.yml b/.github/workflows/pnetcdf_c_master.yml index a2e6b6e..ab940cc 100644 --- a/.github/workflows/pnetcdf_c_master.yml +++ b/.github/workflows/pnetcdf_c_master.yml @@ -88,7 +88,7 @@ jobs: - name: Install PnetCDF-Python run: | export CC=$MPICH_DIR/bin/mpicc - pip install --no-build-isolation -e . + pip install --verbose --no-build-isolation -e . - name: Test PnetCDF-Python run: | @@ -101,7 +101,7 @@ jobs: make install-clean export CC=$MPICH_DIR/bin/mpicc python setup.py sdist - pip install dist/pnetcdf-*.tar.gz + pip install --verbose dist/pnetcdf-*.tar.gz - name: Test PnetCDF-Python run: | diff --git a/.github/workflows/pnetcdf_c_official.yml b/.github/workflows/pnetcdf_c_official.yml index 8ff19b9..3946da3 100644 --- a/.github/workflows/pnetcdf_c_official.yml +++ b/.github/workflows/pnetcdf_c_official.yml @@ -86,7 +86,7 @@ jobs: - name: Install PnetCDF-Python run: | export CC=$MPICH_DIR/bin/mpicc - pip install --no-build-isolation -e . + pip install --verbose --no-build-isolation -e . 
- name: Test PnetCDF-Python run: | @@ -99,7 +99,7 @@ jobs: make install-clean export CC=$MPICH_DIR/bin/mpicc python setup.py sdist - pip install dist/pnetcdf-*.tar.gz + pip install --verbose dist/pnetcdf-*.tar.gz - name: Test PnetCDF-Python run: | From c23dfcd620257776bfea6a0ad38ed528092c9a7c Mon Sep 17 00:00:00 2001 From: wkliao Date: Fri, 9 Aug 2024 16:39:47 -0500 Subject: [PATCH 06/20] simplify examples --- examples/collective_write.py | 77 +++++++++---------------------- examples/create_open.py | 13 ++++-- examples/fill_mode.py | 17 +++++-- examples/flexible_api.py | 47 +++++++++++-------- examples/get_info.py | 15 ++++-- examples/get_vara.py | 25 ++++++---- examples/ghost_cell.py | 29 +++++++----- examples/global_attribute.py | 21 +++++---- examples/hints.py | 26 +++++++---- examples/nonblocking_write.py | 57 +++++------------------ examples/nonblocking_write_def.py | 53 +++++---------------- examples/put_vara.py | 23 +++++---- examples/put_varn_int.py | 23 ++++++--- examples/transpose.py | 52 ++++++++++++--------- examples/transpose2D.py | 20 +++++--- 15 files changed, 243 insertions(+), 255 deletions(-) diff --git a/examples/collective_write.py b/examples/collective_write.py index b0bd236..2a2cb5e 100644 --- a/examples/collective_write.py +++ b/examples/collective_write.py @@ -31,16 +31,12 @@ 32 400 x 400 x 200 6.67 45.72 """ -import sys, os, argparse, inspect +import sys, os, argparse import numpy as np from mpi4py import MPI import pnetcdf -NDIMS = 3 -NUM_VARS = 10 - -def parse_help(comm): - rank = comm.Get_rank() +def parse_help(): help_flag = "-h" in sys.argv or "--help" in sys.argv if help_flag and rank == 0: help_text = ( @@ -54,21 +50,22 @@ def parse_help(comm): print(help_text) return help_flag -def print_info(info_used): - print("MPI hint: cb_nodes =", info_used.Get("cb_nodes")) - print("MPI hint: cb_buffer_size =", info_used.Get("cb_buffer_size")) - print("MPI hint: striping_factor =", info_used.Get("striping_factor")) - print("MPI hint: 
striping_unit =", info_used.Get("striping_unit")) +def pnetcdf_io(filename, file_format, length): + # number of dimensions + NDIMS = 3 + # number of variables + NUM_VARS = 10 -def pnetcdf_io(comm, filename, file_format, length): - rank = comm.Get_rank() - nprocs = comm.Get_size() + if verbose and rank == 0: + print("Number of variables = ", NUM_VARS) + print("Number of dimensions = ", NDIMS) starts = np.zeros(NDIMS, dtype=np.int32) counts = np.zeros(NDIMS, dtype=np.int32) gsizes = np.zeros(NDIMS, dtype=np.int32) buf = [] + # calculate local subarray access pattern psizes = MPI.Compute_dims(nprocs, NDIMS) starts[0] = rank % psizes[0] starts[1] = (rank // psizes[1]) % psizes[1] @@ -87,20 +84,12 @@ def pnetcdf_io(comm, filename, file_format, length): for j in range(bufsize): buf[i][j] = rank * i + 123 + j - comm.Barrier() - write_timing = MPI.Wtime() - # Create the file using file clobber mode - try: - f = pnetcdf.File(filename = filename, \ - mode = 'w', \ - format = file_format, \ - comm = comm, \ - info = None) - except OSError as e: - print("Error at {}:{} ncmpi_create() file {} ({})".format(__file__,inspect.currentframe().f_back.f_lineno, filename, e)) - comm.Abort() - exit(1) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = file_format, + comm = comm, + info = None) # Define dimensions dims = [] @@ -127,34 +116,14 @@ def pnetcdf_io(comm, filename, file_format, length): # Close the file f.close() - write_timing = MPI.Wtime() - write_timing - - # calculate write amount across all processes in total - write_size = bufsize * NUM_VARS * np.dtype(np.int32).itemsize - sum_write_size = comm.reduce(write_size, MPI.SUM, root=0) - max_write_timing = comm.reduce(write_timing, MPI.MAX, root=0) - - if rank == 0 and verbose: - subarray_size = (bufsize * np.dtype(np.int32).itemsize) / 1048576.0 - print_info(info_used) - print("Local array size {} x {} x {} integers, size = {:.2f} MB".format(length, length, length, subarray_size)) - sum_write_size /= 1048576.0 - 
print("Global array size {} x {} x {} integers, write size = {:.2f} GB".format(gsizes[0], gsizes[1], gsizes[2], sum_write_size/1024.0)) - - write_bw = sum_write_size / max_write_timing - print(" procs Global array size exec(sec) write(MB/s)") - print("------- ------------------ --------- -----------") - print(" {:4d} {:4d} x {:4d} x {:4d} {:8.2f} {:10.2f}\n".format(nprocs, gsizes[0], gsizes[1], gsizes[2], max_write_timing, write_bw)) - if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() - if parse_help(comm): + if parse_help(): MPI.Finalize() sys.exit(1) @@ -168,25 +137,25 @@ def pnetcdf_io(comm, filename, file_format, length): parser.add_argument("-l", help="Size of each dimension of the local array\n") args = parser.parse_args() - file_format = None - length = 10 - - if args.q: verbose = False + verbose = False if args.q else True + file_format = None if args.k: - kind_dict = {'1':None, '2':"NETCDF3_64BIT_OFFSET", '5':"NETCDF3_64BIT_DATA"} + kind_dict = {'1':None, '2':"NC_64BIT_OFFSET", '5':"NC_64BIT_DATA"} file_format = kind_dict[args.k] + length = 10 if args.l and int(args.l) > 0: length = int(args.l) filename = args.dir + if verbose and rank == 0: print("{}: example of collective writes".format(os.path.basename(__file__))) # Run I/O try: - pnetcdf_io(comm, filename, file_format, length) + pnetcdf_io(filename, file_format, length) except BaseException as err: print("Error: type:", type(err), str(err)) raise diff --git a/examples/create_open.py b/examples/create_open.py index fd6ebe9..e64e2e8 100644 --- a/examples/create_open.py +++ b/examples/create_open.py @@ -34,11 +34,12 @@ def parse_help(): return help_flag def pnetcdf_io(filename): - if verbose and rank == 0: - print("{}: example of file create and open".format(os.path.basename(__file__))) # create a new file using file clobber mode, i.e. 
flag "-w" - f = pnetcdf.File(filename=filename, mode = 'w', comm=comm, info=None) + f = pnetcdf.File(filename = filename, + mode = 'w', + comm = comm, + info = None) # close the file f.close() @@ -51,7 +52,6 @@ def pnetcdf_io(filename): if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -68,10 +68,13 @@ def pnetcdf_io(filename): parser.add_argument("-q", help="Quiet mode (reports when fail)", action="store_true") args = parser.parse_args() - if args.q: verbose = False + verbose = False if args.q else True filename = args.dir + if verbose and rank == 0: + print("{}: example of file create and open".format(os.path.basename(__file__))) + try: pnetcdf_io(filename) except BaseException as err: diff --git a/examples/fill_mode.py b/examples/fill_mode.py index 95541fb..647a481 100644 --- a/examples/fill_mode.py +++ b/examples/fill_mode.py @@ -63,10 +63,14 @@ def pnetcdf_io(filename): NX = 4 if verbose and rank == 0: - print("{}: example of setting fill mode".format(os.path.basename(__file__))) + print("Y dimension size = ", NY) + print("X dimension size = ", NX) # create a new file using clobber "w" mode - f = pnetcdf.File(filename=filename, mode = 'w', comm=comm, info=None) + f = pnetcdf.File(filename = filename, + mode = 'w', + comm = comm, + info = None) # the global array is NY * (NX * nprocs) global_ny = NY @@ -131,8 +135,11 @@ def pnetcdf_io(filename): # write to the 2nd record rec_var.put_var_all(buf, start = starts, count = counts) + + # close file f.close() + if __name__ == "__main__": verbose = True @@ -152,10 +159,14 @@ def pnetcdf_io(filename): parser.add_argument("-q", help="Quiet mode (reports when fail)", action="store_true") args = parser.parse_args() - if args.q: verbose = False + + verbose = False if args.q else True filename = args.dir + if verbose and rank == 0: + print("{}: example of setting fill mode".format(os.path.basename(__file__))) + try: pnetcdf_io(filename) except BaseException 
as err: diff --git a/examples/flexible_api.py b/examples/flexible_api.py index 854f97d..379d056 100644 --- a/examples/flexible_api.py +++ b/examples/flexible_api.py @@ -89,14 +89,21 @@ def pnetcdf_io(filename, file_format): NY = 5 NX = 5 NZ = 5 - ghost_len = 3 if verbose and rank == 0: - print("{}: example of using flexible APIs".format(os.path.basename(__file__))) + print("Z dimension size = ", NZ) + print("Y dimension size = ", NY) + print("X dimension size = ", NX) + # number of cells at both end of each dimension + ghost_len = 3 # Create the file - f = pnetcdf.File(filename=filename, mode = 'w', format = file_format, comm=comm, info=None) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = file_format, + comm = comm, + info = None) # Define dimensions dim_z = f.def_dim("Z", NZ*nprocs) @@ -146,9 +153,9 @@ def pnetcdf_io(filename, file_format): buf_zy.fill(-1) # read using flexible API - var_zy.get_var_all(buf_zy, start = starts, \ - count = counts, \ - bufcount = 1, \ + var_zy.get_var_all(buf_zy, start = starts, + count = counts, + bufcount = 1, buftype = subarray) # check contents of the get buffer @@ -171,7 +178,10 @@ def pnetcdf_io(filename, file_format): array_of_sizes = np.array([NY + 2 * ghost_len, NX + 2 * ghost_len]) array_of_subsizes = np.array([NY, NX]) array_of_starts = np.array([ghost_len, ghost_len]) - subarray = MPI.DOUBLE.Create_subarray(array_of_sizes, array_of_subsizes, array_of_starts, order=MPI.ORDER_C) + subarray = MPI.DOUBLE.Create_subarray(array_of_sizes, + array_of_subsizes, + array_of_starts, + order=MPI.ORDER_C) subarray.Commit() # initialize write user buffer @@ -181,9 +191,9 @@ def pnetcdf_io(filename, file_format): counts = np.array([NY, NX]) # calling a blocking flexible write API - req_id = var_yx.iput_var(buf_yx, start = starts, \ - count = counts, \ - bufcount = 1, \ + req_id = var_yx.iput_var(buf_yx, start = starts, + count = counts, + bufcount = 1, buftype = subarray) # commit posted pending nonblocking requests @@ 
-198,9 +208,9 @@ def pnetcdf_io(filename, file_format): buf_yx.fill(-1) # calling a blocking flexible read API - req_id = var_yx.iget_var(buf_yx, start = starts, \ - count = counts, \ - bufcount = 1, \ + req_id = var_yx.iget_var(buf_yx, start = starts, + count = counts, + bufcount = 1, buftype=subarray) # commit posted pending nonblocking requests @@ -227,7 +237,6 @@ def pnetcdf_io(filename, file_format): if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -245,16 +254,18 @@ def pnetcdf_io(filename, file_format): parser.add_argument("-k", help="File format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5") args = parser.parse_args() - file_format = None - - if args.q: verbose = False + verbose = False if args.q else True + file_format = None if args.k: - kind_dict = {'1':None, '2':"NETCDF3_64BIT_OFFSET", '5':"NETCDF3_64BIT_DATA"} + kind_dict = {'1':None, '2':"NC_64BIT_OFFSET", '5':"NC_64BIT_DATA"} file_format = kind_dict[args.k] filename = args.dir + if verbose and rank == 0: + print("{}: example of using flexible APIs".format(os.path.basename(__file__))) + try: pnetcdf_io(filename, file_format) except BaseException as err: diff --git a/examples/get_info.py b/examples/get_info.py index 75b4447..8b67349 100644 --- a/examples/get_info.py +++ b/examples/get_info.py @@ -50,6 +50,7 @@ def parse_help(): print(help_text) return help_flag + def print_info(info_used): nkeys = info_used.Get_nkeys() print("MPI File Info: nkeys =", nkeys) @@ -60,11 +61,13 @@ def print_info(info_used): def pnetcdf_io(filename): - if verbose and rank == 0: - print("{}: example of getting MPI-IO hints".format(os.path.basename(__file__))) # create a new file using clobber "w" mode - f = pnetcdf.File(filename=filename, mode = 'w', file_format = "NETCDF3_64BIT_DATA", comm=comm, info=None) + f = pnetcdf.File(filename=filename, + mode = 'w', + file_format = "NC_64BIT_DATA", + comm=comm, + info=None) # exit the define mode f.enddef() @@ -82,7 +85,6 
@@ def pnetcdf_io(filename): if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -99,10 +101,13 @@ def pnetcdf_io(filename): parser.add_argument("-q", help="Quiet mode (reports when fail)", action="store_true") args = parser.parse_args() - if args.q: verbose = False + verbose = False if args.q else True filename = args.dir + if verbose and rank == 0: + print("{}: example of getting MPI-IO hints".format(os.path.basename(__file__))) + try: pnetcdf_io(filename) except BaseException as err: diff --git a/examples/get_vara.py b/examples/get_vara.py index bbb8f3b..84d5094 100644 --- a/examples/get_vara.py +++ b/examples/get_vara.py @@ -68,11 +68,11 @@ def parse_help(): def pnetcdf_io(filename, file_format): - if verbose and rank == 0: - print("{}: reading file ".format(os.path.basename(__file__)), filename) - # Open an existing file for reading - f = pnetcdf.File(filename=filename, mode = 'r', comm=comm, info=None) + f = pnetcdf.File(filename = filename, + mode = 'r', + comm = comm, + info = None) # Get global attribute named "history" str_att = f.get_att("history") @@ -83,6 +83,10 @@ def pnetcdf_io(filename, file_format): global_ny = len(f.dimensions['Y']) global_nx = len(f.dimensions['X']) + if verbose and rank == 0: + print("Y dimension size = ", global_ny) + print("X dimension size = ", global_nx) + # get the variable of a 2D variable of integer type v = f.variables['var'] @@ -101,14 +105,14 @@ def pnetcdf_io(filename, file_format): counts = [local_ny, local_nx] # Read a subarray in collective mode - buff = np.empty(tuple(counts), v.dtype) - v.get_var_all(buff, start = starts, count = counts) + r_buf = np.empty(tuple(counts), v.dtype) + v.get_var_all(r_buf, start = starts, count = counts) # close the file f.close() + if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -129,14 +133,17 @@ def pnetcdf_io(filename, file_format): file_format = 
None length = 10 - if args.q: verbose = False + verbose = False if args.q else True if args.k: - kind_dict = {'1':None, '2':"NETCDF3_64BIT_OFFSET", '5':"NETCDF3_64BIT_DATA"} + kind_dict = {'1':None, '2':"NC_64BIT_OFFSET", '5':"NC_64BIT_DATA"} file_format = kind_dict[args.k] filename = args.dir + if verbose and rank == 0: + print("{}: reading file ".format(os.path.basename(__file__)), filename) + try: pnetcdf_io(filename, file_format) except BaseException as err: diff --git a/examples/ghost_cell.py b/examples/ghost_cell.py index 14de88d..d9f664b 100644 --- a/examples/ghost_cell.py +++ b/examples/ghost_cell.py @@ -83,9 +83,6 @@ def parse_help(): def pnetcdf_io(filename, file_format, length): - if verbose and rank == 0: - print("{}: example of using buffers with ghost cells".format(os.path.basename(__file__))) - counts = [length, length + 1] psizes = MPI.Compute_dims(nprocs, 2) @@ -108,7 +105,8 @@ def pnetcdf_io(filename, file_format, length): # set subarray access pattern counts = np.array([length, length + 1], dtype=np.int64) - starts = np.array([local_rank[0] * counts[0], local_rank[1] * counts[1]], dtype=np.int64) + starts = np.array([local_rank[0] * counts[0], local_rank[1] * counts[1]], + dtype=np.int64) if verbose: print("starts= {} {} counts= {} {}".format(starts[0], starts[1], counts[0], counts[1])) @@ -118,13 +116,19 @@ def pnetcdf_io(filename, file_format, length): buf = np.empty(bufsize, dtype=np.int32) for i in range(counts[0] + 2 * nghosts): for j in range(counts[1] + 2 * nghosts): - if nghosts <= i < counts[0] + nghosts and nghosts <= j < counts[1] + nghosts: + if nghosts <= i < counts[0] + nghosts and \ + nghosts <= j < counts[1] + nghosts: buf[i * (counts[1] + 2 * nghosts) + j] = rank else: - buf[i * (counts[1] + 2 * nghosts) + j] = -8 # all ghost cells have value -8 + # set values of all ghost cells to -8 + buf[i * (counts[1] + 2 * nghosts) + j] = -8 # Create the file - f = pnetcdf.File(filename=filename, mode = 'w', format = file_format, comm=comm, 
info=None) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = file_format, + comm = comm, + info = None) # Define dimensions dim_y = f.def_dim("Y", gsizes[0]) @@ -150,7 +154,6 @@ def pnetcdf_io(filename, file_format, length): if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -169,19 +172,21 @@ def pnetcdf_io(filename, file_format, length): parser.add_argument("-l", help="Size of each dimension of the local array\n") args = parser.parse_args() - if args.q: verbose = False + verbose = False if args.q else True file_format = None if args.k: - kind_dict = {'1':None, '2':"NETCDF3_64BIT_OFFSET", '5':"NETCDF3_64BIT_DATA"} + kind_dict = {'1':None, '2':"NC_64BIT_OFFSET", '5':"NC_64BIT_DATA"} file_format = kind_dict[args.k] length = 4 - if args.l: length = int(args.l) + if args.l and int(args.l) > 0: + length = int(args.l) filename = args.dir - length = 4 if length <= 0 else length + if verbose and rank == 0: + print("{}: example of using buffers with ghost cells".format(os.path.basename(__file__))) try: pnetcdf_io(filename, file_format, length) diff --git a/examples/global_attribute.py b/examples/global_attribute.py index 35b1cde..0c75a57 100644 --- a/examples/global_attribute.py +++ b/examples/global_attribute.py @@ -48,13 +48,14 @@ def parse_help(): def pnetcdf_io(filename, file_format): digit = np.int16([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - if verbose and rank == 0: - print("{}: example of put/get global attributes".format(os.path.basename(__file__))) - # Run pnetcdf i/o # Create the file - f = pnetcdf.File(filename=filename, mode = 'w', format = file_format, comm=comm, info=None) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = file_format, + comm = comm, + info = None) if rank == 0: ltime = time.localtime() @@ -107,7 +108,6 @@ def pnetcdf_io(filename, file_format): if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() 
@@ -125,15 +125,18 @@ def pnetcdf_io(filename, file_format): parser.add_argument("-k", help="File format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5") args = parser.parse_args() - if args.q: verbose = False - - filename = args.dir + verbose = False if args.q else True file_format = None if args.k: - kind_dict = {'1':None, '2':"NETCDF3_64BIT_OFFSET", '5':"NETCDF3_64BIT_DATA"} + kind_dict = {'1':None, '2':"NC_64BIT_OFFSET", '5':"NC_64BIT_DATA"} file_format = kind_dict[args.k] + filename = args.dir + + if verbose and rank == 0: + print("{}: example of put/get global attributes".format(os.path.basename(__file__))) + try: pnetcdf_io(filename, file_format) except BaseException as err: diff --git a/examples/hints.py b/examples/hints.py index 4df0162..d917fcf 100644 --- a/examples/hints.py +++ b/examples/hints.py @@ -94,16 +94,22 @@ def pnetcdf_io(filename): NZ = 5 if verbose and rank == 0: - print("{}: example of set/get PnetCDF hints".format(os.path.basename(__file__))) + print("Z dimension size = ", NZ) + print("Y dimension size = ", NY) + print("X dimension size = ", NX) # create MPI info object and set a few hints - info1 = MPI.Info.Create() - info1.Set("nc_header_align_size", "1024") - info1.Set("nc_var_align_size", "512") - info1.Set("nc_header_read_chunk_size", "256") + info = MPI.Info.Create() + info.Set("nc_header_align_size", "1024") + info.Set("nc_var_align_size", "512") + info.Set("nc_header_read_chunk_size", "256") # create a new file for writing - f = pnetcdf.File(filename=filename, mode = 'w', file_format = "NETCDF3_64BIT_DATA", comm=comm, info=info1) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = "NC_64BIT_DATA", + comm = comm, + info = info) # define dimensions dim_z = f.def_dim('Z', NZ*nprocs) @@ -144,14 +150,13 @@ def pnetcdf_io(filename): if verbose and rank == 0: print_hints(f, var_zy, var_yx) - info1.Free() + info.Free() # close the file f.close() if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() 
nprocs = comm.Get_size() @@ -168,10 +173,13 @@ def pnetcdf_io(filename): parser.add_argument("-q", help="Quiet mode (reports when fail)", action="store_true") args = parser.parse_args() - if args.q: verbose = False + verbose = False if args.q else True filename = args.dir + if verbose and rank == 0: + print("{}: example of set/get PnetCDF hints".format(os.path.basename(__file__))) + try: pnetcdf_io(filename) except BaseException as err: diff --git a/examples/nonblocking_write.py b/examples/nonblocking_write.py index 5395883..55bfe9a 100644 --- a/examples/nonblocking_write.py +++ b/examples/nonblocking_write.py @@ -49,18 +49,14 @@ def parse_help(): print(help_text) return help_flag -def print_info(info_used): - print("MPI hint: cb_nodes =", info_used.Get("cb_nodes")) - print("MPI hint: cb_buffer_size =", info_used.Get("cb_buffer_size")) - print("MPI hint: striping_factor =", info_used.Get("striping_factor")) - print("MPI hint: striping_unit =", info_used.Get("striping_unit")) def pnetcdf_io(filename, length): NDIMS = 3 NUM_VARS = 10 if verbose and rank == 0: - print("{}: example of calling nonblocking write APIs".format(os.path.basename(__file__))) + print("Number of variables = ", NUM_VARS) + print("Number of dimensions = ", NDIMS) # set up subarray access pattern starts = np.zeros(NDIMS, dtype=np.int32) @@ -86,16 +82,12 @@ def pnetcdf_io(filename, length): for j in range(bufsize): buf[i][j] = rank * i + 123 + j - comm.Barrier() - write_timing = MPI.Wtime() - # Create the file - try: - f = pnetcdf.File(filename=filename, mode = 'w', format = "NETCDF3_64BIT_DATA", comm=comm, info=None) - except OSError as e: - print("Error at {}:{} ncmpi_create() file {} ({})".format(__file__,inspect.currentframe().f_back.f_lineno, filename, e)) - comm.Abort() - sys.exit(1) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = "NC_64BIT_DATA", + comm = comm, + info = None) # Define dimensions dims = [] @@ -112,9 +104,6 @@ def pnetcdf_io(filename, length): # Exit the 
define mode f.enddef() - # Get all the hints used - info_used = f.inq_info() - # Write one variable at a time, using iput APIs reqs = [] for i in range(NUM_VARS): @@ -153,38 +142,11 @@ def pnetcdf_io(filename, length): # detach the temporary buffer f.detach_buff() - # obtain write amount yet to now - put_size = f.inq_put_size() - put_size = comm.allreduce(put_size, op=MPI.SUM) - # Close the file f.close() - write_timing = MPI.Wtime() - write_timing - - write_size = bufsize * NUM_VARS * np.dtype(np.int32).itemsize - sum_write_size = comm.reduce(write_size, MPI.SUM, root=0) - max_write_timing = comm.reduce(write_timing, MPI.MAX, root=0) - - if rank == 0 and verbose: - print() - print("Total amount writes to variables only (exclude header) = {} bytes".format(sum_write_size)) - print("Total amount writes reported by pnetcdf (include header) = {} bytes".format(put_size)) - print() - subarray_size = (bufsize * np.dtype(np.int32).itemsize) / 1048576.0 - print_info(info_used) - print("Local array size {} x {} x {} integers, size = {:.2f} MB".format(length, length, length, subarray_size)) - sum_write_size /= 1048576.0 - print("Global array size {} x {} x {} integers, write size = {:.2f} GB".format(gsizes[0], gsizes[1], gsizes[2], sum_write_size/1024.0)) - - write_bw = sum_write_size / max_write_timing - print(" procs Global array size exec(sec) write(MB/s)") - print("------- ------------------ --------- -----------") - print(" {:4d} {:4d} x {:4d} x {:4d} {:8.2f} {:10.2f}\n".format(nprocs, gsizes[0], gsizes[1], gsizes[2], max_write_timing, write_bw)) - if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -202,13 +164,16 @@ def pnetcdf_io(filename, length): parser.add_argument("-l", help="Size of each dimension of the local array\n") args = parser.parse_args() - if args.q: verbose = False + verbose = False if args.q else True length = 10 if args.l and int(args.l) > 0: length = int(args.l) filename = args.dir + if 
verbose and rank == 0: + print("{}: example of calling nonblocking write APIs".format(os.path.basename(__file__))) + try: pnetcdf_io(filename, length) except BaseException as err: diff --git a/examples/nonblocking_write_def.py b/examples/nonblocking_write_def.py index 23c83d4..8ebb860 100644 --- a/examples/nonblocking_write_def.py +++ b/examples/nonblocking_write_def.py @@ -50,19 +50,14 @@ def parse_help(): print(help_text) return help_flag -def print_info(info_used): - print("MPI hint: cb_nodes =", info_used.Get("cb_nodes")) - print("MPI hint: cb_buffer_size =", info_used.Get("cb_buffer_size")) - print("MPI hint: striping_factor =", info_used.Get("striping_factor")) - print("MPI hint: striping_unit =", info_used.Get("striping_unit")) def pnetcdf_io(file_name, length): NDIMS = 3 NUM_VARS = 10 if verbose and rank == 0: - print("{}: example of nonblocking APIs in define mode".format(os.path.basename(__file__))) - + print("Number of variables = ", NUM_VARS) + print("Number of dimensions = ", NDIMS) # set subarray access pattern starts = np.zeros(NDIMS, dtype=np.int32) @@ -88,11 +83,12 @@ def pnetcdf_io(file_name, length): for j in range(bufsize): buf[i][j] = rank * i + 123 + j - comm.Barrier() - write_timing = MPI.Wtime() - # Create the file - f = pnetcdf.File(filename=filename, mode = 'w', format = "NETCDF3_64BIT_DATA", comm=comm, info=None) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = "NC_64BIT_DATA", + comm = comm, + info = None) # Define dimensions dims = [] @@ -140,41 +136,11 @@ def pnetcdf_io(file_name, length): # detach the temporary buffer f.detach_buff() - # Get all the hints used - info_used = f.inq_info() - - # check write amount - put_size = f.inq_put_size() - put_size = comm.allreduce(put_size, op=MPI.SUM) - # close the file f.close() - write_timing = MPI.Wtime() - write_timing - - write_size = bufsize * NUM_VARS * np.dtype(np.int32).itemsize - sum_write_size = comm.reduce(write_size, MPI.SUM, root=0) - max_write_timing = 
comm.reduce(write_timing, MPI.MAX, root=0) - - if rank == 0 and verbose: - print() - print("Total amount writes to variables only (exclude header) = {} bytes".format(sum_write_size)) - print("Total amount writes reported by pnetcdf (include header) = {} bytes".format(put_size)) - print() - subarray_size = (bufsize * np.dtype(np.int32).itemsize) / 1048576.0 - print_info(info_used) - print("Local array size {} x {} x {} integers, size = {:.2f} MB".format(length, length, length, subarray_size)) - sum_write_size /= 1048576.0 - print("Global array size {} x {} x {} integers, write size = {:.2f} GB".format(gsizes[0], gsizes[1], gsizes[2], sum_write_size/1024.0)) - - write_bw = sum_write_size / max_write_timing - print(" procs Global array size exec(sec) write(MB/s)") - print("------- ------------------ --------- -----------") - print(" {:4d} {:4d} x {:4d} x {:4d} {:8.2f} {:10.2f}\n".format(nprocs, gsizes[0], gsizes[1], gsizes[2], max_write_timing, write_bw)) - if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -192,7 +158,7 @@ def pnetcdf_io(file_name, length): parser.add_argument("-l", help="Size of each dimension of the local array\n") args = parser.parse_args() - if args.q: verbose = False + verbose = False if args.q else True length = 10 if args.l and int(args.l) > 0: @@ -200,6 +166,9 @@ def pnetcdf_io(file_name, length): filename = args.dir + if verbose and rank == 0: + print("{}: example of nonblocking APIs in define mode".format(os.path.basename(__file__))) + try: pnetcdf_io(filename, length) except BaseException as err: diff --git a/examples/put_vara.py b/examples/put_vara.py index 1d4e431..61b7add 100644 --- a/examples/put_vara.py +++ b/examples/put_vara.py @@ -71,9 +71,6 @@ def parse_help(): def pnetcdf_io(filename, file_format): - if verbose and rank == 0: - print("{}: example of writing subarrays".format(os.path.basename(__file__))) - NY = 10 NX = 4 global_ny = NY @@ -81,8 +78,16 @@ def 
pnetcdf_io(filename, file_format): starts = [0, NX * rank] counts = [NY, NX] + if verbose and rank == 0: + print("Y dimension size = ", NY) + print("X dimension size = ", NX) + # Create the file - f = pnetcdf.File(filename=filename, mode = 'w', format = file_format, comm=comm, info=None) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = file_format, + comm = comm, + info = None) # Add a global attribute: a time stamp at rank 0 if rank == 0: @@ -113,7 +118,7 @@ def pnetcdf_io(filename, file_format): short_att = np.int16(1000) var.put_att("short_att_name", short_att) - # Exit the define mode + # Exit the define mode f.enddef() # initialize write buffer @@ -127,7 +132,6 @@ def pnetcdf_io(filename, file_format): if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -145,15 +149,18 @@ def pnetcdf_io(filename, file_format): parser.add_argument("-k", help="File format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5") args = parser.parse_args() - if args.q: verbose = False + verbose = False if args.q else True file_format = None if args.k: - kind_dict = {'1':None, '2':"NETCDF3_64BIT_OFFSET", '5':"NETCDF3_64BIT_DATA"} + kind_dict = {'1':None, '2':"NC_64BIT_OFFSET", '5':"NC_64BIT_DATA"} file_format = kind_dict[args.k] filename = args.dir + if verbose and rank == 0: + print("{}: example of writing subarrays".format(os.path.basename(__file__))) + try: pnetcdf_io(filename, file_format) except BaseException as err: diff --git a/examples/put_varn_int.py b/examples/put_varn_int.py index 39026a4..7193e98 100644 --- a/examples/put_varn_int.py +++ b/examples/put_varn_int.py @@ -55,16 +55,23 @@ def parse_help(): print(help_text) return help_flag + def pnetcdf_io(file_name, file_format): NY = 4 NX = 10 NDIMS = 2 if verbose and rank == 0: - print("{}: example of writing multiple variables in a call".format(os.path.basename(__file__))) + print("Number of dimensions = ", NDIMS) + print("Y dimension size = ", NY) + 
print("X dimension size = ", NX) # create a new file - f = pnetcdf.File(filename=filename, mode = 'w', format=file_format, comm=comm, info=None) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = file_format, + comm = comm, + info = None) # define dimensions dimx = f.def_dim('x',NX) @@ -143,17 +150,16 @@ def pnetcdf_io(file_name, file_format): # allocate I/O buffer and initialize its contents w_len = np.sum(np.prod(counts, axis=1)) - buffer = np.full(w_len, rank, dtype=np.int32) + w_buf = np.full(w_len, rank, dtype=np.int32) # set the buffer pointers to different offsets to the I/O buffe - v.put_var_all(buffer, start = starts, count = counts, num = num_reqs) + v.put_var_all(w_buf, start = starts, count = counts, num = num_reqs) # close the file f.close() if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -171,15 +177,18 @@ def pnetcdf_io(file_name, file_format): parser.add_argument("-k", help="File format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5") args = parser.parse_args() - if args.q: verbose = False + verbose = False if args.q else True file_format = None if args.k: - kind_dict = {'1':None, '2':"NETCDF3_64BIT_OFFSET", '5':"NETCDF3_64BIT_DATA"} + kind_dict = {'1':None, '2':"NC_64BIT_OFFSET", '5':"NC_64BIT_DATA"} file_format = kind_dict[args.k] filename = args.dir + if verbose and rank == 0: + print("{}: example of writing multiple variables in a call".format(os.path.basename(__file__))) + try: pnetcdf_io(filename, file_format) except BaseException as err: diff --git a/examples/transpose.py b/examples/transpose.py index 5473c27..516b03b 100644 --- a/examples/transpose.py +++ b/examples/transpose.py @@ -43,17 +43,17 @@ def parse_help(): print(help_text) return help_flag -def pnetcdf_io(filename, file_format, length): - if verbose and rank == 0: - print("{}: example of put/get 3D transposed arrays".format(os.path.basename(__file__))) +def pnetcdf_io(filename, file_format, length): NDIMS = 
3 + if verbose and rank == 0: + print("Number of dimensions = ", NDIMS) - gsizes = np.zeros(NDIMS, dtype=np.int64) - starts = np.zeros(NDIMS, dtype=np.int64) - counts = np.zeros(NDIMS, dtype=np.int64) - imap = np.zeros(NDIMS, dtype=np.int64) + gsizes = np.zeros(NDIMS, dtype=np.int64) + starts = np.zeros(NDIMS, dtype=np.int64) + counts = np.zeros(NDIMS, dtype=np.int64) + imap = np.zeros(NDIMS, dtype=np.int64) startsT = np.zeros(NDIMS, dtype=np.int64) countsT = np.zeros(NDIMS, dtype=np.int64) @@ -73,10 +73,10 @@ def pnetcdf_io(filename, file_format, length): # set up subarray access pattern bufsize = 1 for i in range(NDIMS): - gsizes[i] = (length + i) * psizes[i] # global array size + gsizes[i] = (length + i) * psizes[i] # global array size starts[i] *= (length + i) # start indices - counts[i] = (length + i) # array elements - bufsize *= (length + i) + counts[i] = (length + i) # array elements + bufsize *= (length + i) # allocate buffer and initialize with contiguous numbers buf = np.empty(bufsize, dtype=int) @@ -84,11 +84,17 @@ def pnetcdf_io(filename, file_format, length): for k in range(counts[0]): for j in range(counts[1]): for i in range(counts[2]): - buf[index] = (starts[0]+k)*gsizes[1]*gsizes[2] + (starts[1]+j)*gsizes[2] + (starts[2]+i) + buf[index] = (starts[0]+k)*gsizes[1]*gsizes[2] + \ + (starts[1]+j)*gsizes[2] + \ + (starts[2]+i) index += 1 # Create the file - f = pnetcdf.File(filename=filename, mode = 'w', format = file_format, comm=comm, info=None) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = file_format, + comm = comm, + info = None) # Define dimensions dim_z = f.def_dim("Z", gsizes[0]) @@ -99,29 +105,31 @@ def pnetcdf_io(filename, file_format, length): var_zyx = f.def_var("ZYX_var", pnetcdf.NC_INT, (dim_z, dim_y, dim_x)) # Define variable with transposed file layout: ZXY - var_zxy = f.def_var("ZXY_var", pnetcdf.NC_INT, (dim_z, dim_x, dim_y)) + var_zxy = f.def_var("ZXY_var", pnetcdf.NC_INT, (dim_z, dim_x, dim_y)) # Define variable 
with transposed file layout: YZX - var_yzx = f.def_var("YZX_var", pnetcdf.NC_INT, (dim_y, dim_z, dim_x)) + var_yzx = f.def_var("YZX_var", pnetcdf.NC_INT, (dim_y, dim_z, dim_x)) # Define variable with transposed file layout: YXZ - var_yxz = f.def_var("YXZ_var", pnetcdf.NC_INT, (dim_y, dim_x, dim_z)) + var_yxz = f.def_var("YXZ_var", pnetcdf.NC_INT, (dim_y, dim_x, dim_z)) # Define variable with transposed file layout: XZY - var_xzy = f.def_var("XZY_var", pnetcdf.NC_INT, (dim_x, dim_z, dim_y)) + var_xzy = f.def_var("XZY_var", pnetcdf.NC_INT, (dim_x, dim_z, dim_y)) # Define variable with transposed file layout: XYZ - var_xyz = f.def_var("XYZ_var", pnetcdf.NC_INT, (dim_x, dim_y, dim_z)) + var_xyz = f.def_var("XYZ_var", pnetcdf.NC_INT, (dim_x, dim_y, dim_z)) # Exit the define mode f.enddef() # Write the whole variable in file: ZYX var_zyx.put_var_all(buf, start=starts, count=counts) + # ZYX -> ZXY: - imap[1] = 1; imap[2] = counts[2]; imap[0] = counts[1]*counts[2] + imap[1] = 1; imap[2] = counts[2]; imap[0] = counts[1]*counts[2] startsT[0] = starts[0]; startsT[1] = starts[2]; startsT[2] = starts[1] countsT[0] = counts[0]; countsT[1] = counts[2]; countsT[2] = counts[1] var_zxy.put_var_all(buf, start = startsT, count = countsT, imap = imap) + # ZYX -> ZXY: imap[1] = 1; imap[2] = counts[2]; imap[0] = counts[1]*counts[2] startsT[0] = starts[0]; startsT[1] = starts[2]; startsT[2] = starts[1] @@ -157,7 +165,6 @@ def pnetcdf_io(filename, file_format, length): if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -177,11 +184,11 @@ def pnetcdf_io(filename, file_format, length): args = parser.parse_args() - if args.q: verbose = False + verbose = False if args.q else True file_format = None if args.k: - kind_dict = {'1':None, '2':"NETCDF3_64BIT_OFFSET", '5':"NETCDF3_64BIT_DATA"} + kind_dict = {'1':None, '2':"NC_64BIT_OFFSET", '5':"NC_64BIT_DATA"} file_format = kind_dict[args.k] length = 10 @@ -189,6 +196,9 @@ def 
pnetcdf_io(filename, file_format, length): filename = args.dir + if verbose and rank == 0: + print("{}: example of put/get 3D transposed arrays".format(os.path.basename(__file__))) + try: pnetcdf_io(filename, file_format, length) except BaseException as err: diff --git a/examples/transpose2D.py b/examples/transpose2D.py index e5b1db1..63ebcc4 100644 --- a/examples/transpose2D.py +++ b/examples/transpose2D.py @@ -66,11 +66,11 @@ def parse_help(): return help_flag def pnetcdf_io(filename, file_format, length): - if verbose and rank == 0: - print("{}: example of put/get 2D transposed arrays".format(os.path.basename(__file__))) - NDIMS = 2 + if verbose and rank == 0: + print("Number of dimensions = ", NDIMS) + gsizes = np.zeros(NDIMS, dtype=np.int64) starts = np.zeros(NDIMS, dtype=np.int64) counts = np.zeros(NDIMS, dtype=np.int64) @@ -114,7 +114,11 @@ def pnetcdf_io(filename, file_format, length): # Create the file - f = pnetcdf.File(filename=filename, mode = 'w', format = file_format, comm=comm, info=None) + f = pnetcdf.File(filename = filename, + mode = 'w', + format = file_format, + comm = comm, + info = None) # Define dimensions dim_y = f.def_dim("Y", gsizes[0]) @@ -144,7 +148,6 @@ def pnetcdf_io(filename, file_format, length): if __name__ == "__main__": - verbose = True comm = MPI.COMM_WORLD rank = comm.Get_rank() nprocs = comm.Get_size() @@ -163,11 +166,11 @@ def pnetcdf_io(filename, file_format, length): parser.add_argument("-l", help="size of each dimension of the local array") args = parser.parse_args() - if args.q: verbose = False + verbose = False if args.q else True file_format = None if args.k: - kind_dict = {'1':None, '2':"NETCDF3_64BIT_OFFSET", '5':"NETCDF3_64BIT_DATA"} + kind_dict = {'1':None, '2':"NC_64BIT_OFFSET", '5':"NC_64BIT_DATA"} file_format = kind_dict[args.k] length = 2 @@ -175,6 +178,9 @@ def pnetcdf_io(filename, file_format, length): filename = args.dir + if verbose and rank == 0: + print("{}: example of put/get 2D transposed 
arrays".format(os.path.basename(__file__))) + try: pnetcdf_io(filename, file_format, length) except BaseException as err: From 72b3a088540518b0be5d90474aa75b6f7015866e Mon Sep 17 00:00:00 2001 From: KWang1998 Date: Fri, 9 Aug 2024 20:46:14 -0500 Subject: [PATCH 07/20] add README with directory structure and run instructions --- examples/MNIST/README.md | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 examples/MNIST/README.md diff --git a/examples/MNIST/README.md b/examples/MNIST/README.md new file mode 100644 index 0000000..c7f8c11 --- /dev/null +++ b/examples/MNIST/README.md @@ -0,0 +1,46 @@ +# PnetCDF-python MNIST example + +This directory contains the description and run instructions for the MNIST example Python programs that utilize PnetCDF for file I/O and parallel training with MNIST data. + +## Directory Structure + +- **MNIST_data**: This folder contains a mini MNIST test dataset stored in a NetCDF file (`mnist_images_mini.nc`). The file includes: + - 60 training samples + - 12 testing samples + +- **MNIST_codes**: This folder contains the example MNIST training code. The example code is based on the [PyTorch MNIST example](https://github.com/pytorch/examples/tree/main/mnist) and uses `DistributedDataParallel` for parallel training. + +## Running the MNIST Example Program + +To run the MNIST example program, use the `mpiexec` command. The example below runs the program on 4 MPI processes. 
+ +### Command: + +```sh +mpiexec -n 4 python main.py +``` + +### Expected Output: + +When using 4 MPI processes, the output is expected to be similar to the following: + +```sh +nprocs = 4 rank = 0 device = cpu mpi_size = 4 mpi_rank = 0 +nprocs = 4 rank = 2 device = cpu mpi_size = 4 mpi_rank = 2 +nprocs = 4 rank = 1 device = cpu mpi_size = 4 mpi_rank = 1 +nprocs = 4 rank = 3 device = cpu mpi_size = 4 mpi_rank = 3 + +Train Epoch: 1 Average Loss: 2.288340 +Test set: Average loss: 2.7425, Accuracy: 0/12 (0%) + +Train Epoch: 2 Average Loss: 2.490800 +Test set: Average loss: 1.9361, Accuracy: 6/12 (50%) + +Train Epoch: 3 Average Loss: 2.216520 +Test set: Average loss: 1.8703, Accuracy: 7/12 (58%) +``` + +### Notes: +- The test set accuracy may vary slightly depending on how the data is distributed across the MPI processes. +- The accuracy and loss reported after each epoch are averaged across all MPI processes. + From c725489ae068ad737721e8dbcd4c7d6c280c1362 Mon Sep 17 00:00:00 2001 From: wkliao Date: Sat, 10 Aug 2024 18:05:58 -0500 Subject: [PATCH 08/20] download mnist pytorch file and apply patch * add patch file, mnist.patch * add Makefile to enable run 'make check' * put PnetCDF-IO part into a separate file, pnetcdf_io.py --- examples/MNIST/Makefile | 25 ++++ examples/MNIST/comm_file.py | 212 +++++++++++++++++++++++++++++++++ examples/MNIST/mnist.patch | 134 +++++++++++++++++++++ examples/MNIST/mnist_images.nc | Bin 0 -> 57032 bytes examples/MNIST/pnetcdf_io.py | 42 +++++++ 5 files changed, 413 insertions(+) create mode 100644 examples/MNIST/Makefile create mode 100644 examples/MNIST/comm_file.py create mode 100644 examples/MNIST/mnist.patch create mode 100644 examples/MNIST/mnist_images.nc create mode 100644 examples/MNIST/pnetcdf_io.py diff --git a/examples/MNIST/Makefile b/examples/MNIST/Makefile new file mode 100644 index 0000000..ef82d59 --- /dev/null +++ b/examples/MNIST/Makefile @@ -0,0 +1,25 @@ +# +# Copyright (C) 2024, Northwestern University and Argonne 
class distributed():
    """Initialize torch.distributed from the job launcher's environment.

    The constructor fills in any of MASTER_ADDR, MASTER_PORT, WORLD_SIZE and
    RANK that are not already set, deriving them from SLURM, OpenMPI or
    MPICH/PMI variables (or from mpi4py), and then calls
    dist.init_process_group() with init_method='env://'.

    Supported values of `method`:
        "nccl-slurm"   - NCCL backend, values taken from SLURM_* variables
        "nccl-openmpi" - NCCL backend, values taken from OMPI_*/PMIX_* variables
        "nccl-mpich"   - NCCL backend, values taken from PMI_* variables or MPI
        "gloo"         - Gloo backend, values taken from OMPI_*, PMI_* or MPI

    Attributes:
        mpi_comm: MPI.COMM_WORLD, kept so callers can issue MPI collectives.
    """

    # Port used when neither MASTER_PORT nor a launcher-provided port is set
    # (the default pytorch port).
    _DEFAULT_PORT = "29500"

    @staticmethod
    def _pmix_server_host(uri):
        """Return the host portion of a PMIX_SERVER_URI2 value.

        The text after the first "//" is taken and everything from the first
        ":" onward (the port) is dropped.
        NOTE(review): assumes the value looks like "<id>;tcp4://host:port";
        confirm against the OpenMPI/PMIx version in use.
        """
        return uri.split("//")[1].split(":")[0]

    @staticmethod
    def _slurm_world_size(env):
        """Compute the total number of tasks from SLURM variables in `env`.

        Args:
            env: mapping of environment variable names to string values.

        Returns:
            int: SLURM_NTASKS if set; otherwise
            SLURM_NTASKS_PER_NODE * SLURM_JOB_NUM_NODES; otherwise the sum of
            the per-node counts listed in SLURM_TASKS_PER_NODE.

        Raises:
            Exception: if the required SLURM variables are missing or
            SLURM_TASKS_PER_NODE cannot be parsed.
        """
        import re

        if "SLURM_NTASKS" in env:
            return int(env["SLURM_NTASKS"])

        if "SLURM_JOB_NUM_NODES" not in env:
            raise Exception("Error: nccl-slurm - SLURM_JOB_NUM_NODES is not set")
        # Environment values are strings; convert before doing arithmetic.
        num_nodes = int(env["SLURM_JOB_NUM_NODES"])

        if "SLURM_NTASKS_PER_NODE" in env:
            return int(env["SLURM_NTASKS_PER_NODE"]) * num_nodes

        if "SLURM_TASKS_PER_NODE" in env:
            # SLURM_TASKS_PER_NODE is a comma-separated list of per-node task
            # counts where "N(xR)" means R consecutive nodes run N tasks each,
            # e.g. "2(x3),1". Its sum is already the job-wide total, so it is
            # not multiplied by the node count.
            total = 0
            for item in env["SLURM_TASKS_PER_NODE"].split(","):
                match = re.fullmatch(r"(\d+)(?:\(x(\d+)\))?", item.strip())
                if match is None:
                    raise Exception(
                        "Error: nccl-slurm - cannot parse SLURM_TASKS_PER_NODE: "
                        + env["SLURM_TASKS_PER_NODE"])
                total += int(match.group(1)) * int(match.group(2) or 1)
            return total

        raise Exception("Error: nccl-slurm - SLURM_(N)TASKS_PER_NODE is not set")

    @staticmethod
    def _launcher_value(names, mpi_query, default):
        """Return a launcher-provided value with MPI and constant fallbacks.

        The first variable in `names` that is set in os.environ wins. If none
        is set and MPI has been initialized, `mpi_query()` is returned;
        otherwise `default`.
        """
        for name in names:
            if name in os.environ:
                return os.environ[name]
        if MPI.Is_initialized():
            return mpi_query()
        return default

    def get_size(self):
        """Return the process-group world size, or 1 if none is initialized."""
        if dist.is_available() and dist.is_initialized():
            return dist.get_world_size()
        return 1

    def get_rank(self):
        """Return this process's global rank, or 0 if no group is initialized."""
        if dist.is_available() and dist.is_initialized():
            return dist.get_rank()
        return 0

    def get_local_rank(self):
        """Return the index of the GPU this process should use.

        Returns 0 when no process group is initialized, and -1 when a group is
        initialized but CUDA is unavailable (CPU runs should not call this).
        """
        if not (dist.is_available() and dist.is_initialized()):
            return 0
        if torch.cuda.is_available():
            # Number of GPUs per node determines the wrap-around.
            return dist.get_rank() % torch.cuda.device_count()
        # running on cpu device should not call this function
        return -1

    def __init__(self, method):
        """Populate the env:// rendezvous variables and start the process group.

        Args:
            method: one of "nccl-slurm", "nccl-openmpi", "nccl-mpich", "gloo".

        Raises:
            Exception: a launcher variable needed to derive a value is missing.
            NotImplementedError: `method` is not one of the supported names.
        """
        # MASTER_PORT - required; has to be a free port on machine with rank 0
        # MASTER_ADDR - required (except for rank 0); address of rank 0 node
        # WORLD_SIZE - required; can be set either here, or in a call to init function
        # RANK - required; can be set either here, or in a call to init function
        self.mpi_comm = MPI.COMM_WORLD
        env = os.environ

        if method == "nccl-slurm":
            # MASTER_ADDR can be set in the slurm batch script using command
            # scontrol show hostnames $SLURM_JOB_NODELIST
            if "MASTER_ADDR" not in env:
                # Try SLURM_LAUNCH_NODE_IPADDR but it is the IP address of the node
                # from which the task launch was initiated (where the srun command
                # ran from). It may not be the node of rank 0.
                if "SLURM_LAUNCH_NODE_IPADDR" in env:
                    env["MASTER_ADDR"] = env["SLURM_LAUNCH_NODE_IPADDR"]
                else:
                    raise Exception("Error: nccl-slurm - SLURM_LAUNCH_NODE_IPADDR is not set")

            # Prefer the srun communication port, else the default pytorch port
            if "MASTER_PORT" not in env:
                env["MASTER_PORT"] = env.get("SLURM_SRUN_COMM_PORT", self._DEFAULT_PORT)

            # obtain WORLD_SIZE
            if "WORLD_SIZE" not in env:
                env["WORLD_SIZE"] = str(self._slurm_world_size(env))

            # obtain RANK
            if "RANK" not in env:
                if "SLURM_PROCID" in env:
                    env["RANK"] = env["SLURM_PROCID"]
                else:
                    raise Exception("Error: nccl-slurm - SLURM_PROCID is not set")

            backend = "nccl"

        elif method == "nccl-openmpi":
            if "MASTER_ADDR" not in env:
                if "PMIX_SERVER_URI2" in env:
                    # Index the mapping (it is not callable) and drop the port,
                    # exactly as the gloo path below does.
                    env["MASTER_ADDR"] = self._pmix_server_host(env["PMIX_SERVER_URI2"])
                else:
                    raise Exception("Error: nccl-openmpi - PMIX_SERVER_URI2 is not set")

            # Use the default pytorch port
            env.setdefault("MASTER_PORT", self._DEFAULT_PORT)

            if "WORLD_SIZE" not in env:
                if "OMPI_COMM_WORLD_SIZE" not in env:
                    raise Exception("Error: nccl-openmpi - OMPI_COMM_WORLD_SIZE is not set")
                env["WORLD_SIZE"] = env["OMPI_COMM_WORLD_SIZE"]

            if "RANK" not in env:
                if "OMPI_COMM_WORLD_RANK" not in env:
                    raise Exception("Error: nccl-openmpi - OMPI_COMM_WORLD_RANK is not set")
                env["RANK"] = env["OMPI_COMM_WORLD_RANK"]

            backend = "nccl"

        elif method == "nccl-mpich":
            env.setdefault("MASTER_ADDR", "localhost")

            # Use the default pytorch port
            env.setdefault("MASTER_PORT", self._DEFAULT_PORT)

            if "WORLD_SIZE" not in env:
                env["WORLD_SIZE"] = str(self._launcher_value(
                    ("PMI_SIZE",), lambda: MPI.COMM_WORLD.Get_size(), 1))

            if "RANK" not in env:
                env["RANK"] = str(self._launcher_value(
                    ("PMI_RANK",), lambda: MPI.COMM_WORLD.Get_rank(), 0))

            backend = "nccl"

        elif method == "gloo":
            if "MASTER_ADDR" not in env:
                # check if OpenMPI is used
                if "PMIX_SERVER_URI2" in env:
                    env["MASTER_ADDR"] = self._pmix_server_host(env["PMIX_SERVER_URI2"])
                else:
                    env["MASTER_ADDR"] = "localhost"

            # Use the default pytorch port
            env.setdefault("MASTER_PORT", self._DEFAULT_PORT)

            # obtain WORLD_SIZE: OpenMPI first, then PMI, then MPI itself
            if "WORLD_SIZE" not in env:
                env["WORLD_SIZE"] = str(self._launcher_value(
                    ("OMPI_COMM_WORLD_SIZE", "PMI_SIZE"),
                    lambda: MPI.COMM_WORLD.Get_size(), 1))

            # obtain RANK: OpenMPI first, then PMI, then MPI itself
            if "RANK" not in env:
                env["RANK"] = str(self._launcher_value(
                    ("OMPI_COMM_WORLD_RANK", "PMI_RANK"),
                    lambda: MPI.COMM_WORLD.Get_rank(), 0))

            backend = "gloo"

        else:
            raise NotImplementedError(
                "unsupported distributed method: {!r}".format(method))

        # Initialize DDP module
        dist.init_process_group(backend=backend, init_method='env://')

    def finalize(self):
        """Tear down the process group created by the constructor."""
        dist.destroy_process_group()


#----< init_parallel() >-------------------------------------------------------
def init_parallel():
    """Create the distributed environment and choose the training device.

    Uses the "gloo" method when CUDA is unavailable and "nccl-mpich"
    otherwise.

    Returns:
        tuple: (comm, device) where comm is a `distributed` instance and
        device is torch.device("cpu") or torch.device("cuda:<local_rank>").
    """
    # check if cuda device is available (queried once and reused below)
    use_cuda = torch.cuda.is_available()

    # initialize parallel/distributed environment
    comm = distributed("nccl-mpich" if use_cuda else "gloo")

    # select training device: cpu or cuda
    if use_cuda:
        device = torch.device("cuda:" + str(comm.get_local_rank()))
    else:
        device = torch.device("cpu")

    return comm, device
++# ++ + import argparse + import torch + import torch.nn as nn +@@ -5,7 +10,11 @@ + import torch.optim as optim + from torchvision import datasets, transforms + from torch.optim.lr_scheduler import StepLR ++from torch.nn.parallel import DistributedDataParallel as DDP ++from torch.utils.data.distributed import DistributedSampler + ++import comm_file, pnetcdf_io ++from mpi4py import MPI + + class Net(nn.Module): + def __init__(self): +@@ -42,14 +51,13 @@ + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() +- if batch_idx % args.log_interval == 0: ++ if rank == 0 and batch_idx % args.log_interval == 0: + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. * batch_idx / len(train_loader), loss.item())) + if args.dry_run: + break + +- + def test(model, device, test_loader): + model.eval() + test_loss = 0 +@@ -62,9 +70,14 @@ + pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + ++ # aggregate loss among all ranks ++ test_loss = comm.mpi_comm.allreduce(test_loss, op=MPI.SUM) ++ correct = comm.mpi_comm.allreduce(correct, op=MPI.SUM) ++ + test_loss /= len(test_loader.dataset) + +- print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( ++ if rank == 0: ++ print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), + 100. 
* correct / len(test_loader.dataset))) + +@@ -94,6 +107,8 @@ + help='how many batches to wait before logging training status') + parser.add_argument('--save-model', action='store_true', default=False, + help='For Saving the current Model') ++ parser.add_argument('--input-file', type=str, required=True, ++ help='NetCDF file storing train and test samples') + args = parser.parse_args() + use_cuda = not args.no_cuda and torch.cuda.is_available() + use_mps = not args.no_mps and torch.backends.mps.is_available() +@@ -107,7 +122,7 @@ + else: + device = torch.device("cpu") + +- train_kwargs = {'batch_size': args.batch_size} ++ train_kwargs = {'batch_size': args.batch_size//nprocs} + test_kwargs = {'batch_size': args.test_batch_size} + if use_cuda: + cuda_kwargs = {'num_workers': 1, +@@ -120,25 +135,53 @@ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) +- dataset1 = datasets.MNIST('../data', train=True, download=True, +- transform=transform) +- dataset2 = datasets.MNIST('../data', train=False, +- transform=transform) +- train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) +- test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) ++ ++ # Open files storing training and testing samples ++ infile = args.input_file ++ train_file = pnetcdf_io.dataset(infile, 'train_images', 'train_labels', transform, comm.mpi_comm) ++ test_file = pnetcdf_io.dataset(infile, 'test_images', 'test_labels', transform, comm.mpi_comm) ++ ++ # create distributed samplers ++ train_sampler = DistributedSampler(train_file, num_replicas=nprocs, rank=rank, shuffle=True) ++ test_sampler = DistributedSampler(test_file, num_replicas=nprocs, rank=rank, shuffle=False) ++ ++ # add distributed samplers to DataLoaders ++ train_loader = torch.utils.data.DataLoader(train_file, sampler=train_sampler, **train_kwargs) ++ test_loader = torch.utils.data.DataLoader(test_file, sampler=test_sampler, **test_kwargs, drop_last=False) + + model = Net().to(device) ++ ++ # use 
DDP ++ model = DDP(model, device_ids=[device] if use_cuda else None) ++ + optimizer = optim.Adadelta(model.parameters(), lr=args.lr) + + scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) + for epoch in range(1, args.epochs + 1): ++ # train sampler set epoch ++ train_sampler.set_epoch(epoch) ++ test_sampler.set_epoch(epoch) ++ + train(args, model, device, train_loader, optimizer, epoch) + test(model, device, test_loader) + scheduler.step() + + if args.save_model: +- torch.save(model.state_dict(), "mnist_cnn.pt") ++ if rank == 0: ++ torch.save(model.state_dict(), "mnist_cnn.pt") + ++ # close files ++ train_file.close() ++ test_file.close() + + if __name__ == '__main__': ++ ## initialize parallel environment ++ comm, device = comm_file.init_parallel() ++ ++ rank = comm.get_rank() ++ nprocs = comm.get_size() ++ + main() ++ ++ comm.finalize() ++ diff --git a/examples/MNIST/mnist_images.nc b/examples/MNIST/mnist_images.nc new file mode 100644 index 0000000000000000000000000000000000000000..4cf0aa40dad3bf85817df9bff4a4009104b7b253 GIT binary patch literal 57032 zcmeHw2YeRA*7rafArL|cRUnbxtAd~u1t}uEC<4+1LQ4DI_7pAbGeTE!AlkFJ)|DB=*VUB^S=ECe6HElZ0M0(##}a#EC2gkwvdl{9_DNz zEnnJEv*|LWA6z!exoqvjE=09y$rjRVXf`z?TE1N6cW=K2yog6Pe}DhUUvDq5-kCYY ze14_NeY|`VVIgV@NjcKggw=kk$l=G)t5c^pDw-hVe_O$h@5xLyi&eVA4?_q!=Yy&) zw`0gR*K+caK(31Bk#PR@(c}7rlNLI|6J?X5e)?Ry(mdEPiI6Xg^HLXI{dYo^ydnYp zdSUX11ViXKdEQhA)|j@JkPD(Sskrzdlw9^6yIQf4cUhU9YG;FrzH^Ns$2-~#`fpaQ z!;uc&yi)xw(M%_8rdAu6B76?PlQ;#7=oY0;|88K zH1AvYF~O(1U^3o@tpR%k<30z-1|F*mk5c$Z@}OpW=SB`=+1Qn@&A7q{kyL9|E36~& z7rUtO30`Mb^xBD&S{5p)VMokZ1B{#KJi!+1I0@GwK#gFvI z$7?qE-Q47K#gSS}ujx3drN*#hSNx5Ot+ftsH6sXhBT=eg{)#C#1YZ_LAXBpjn(z&o}O91;5&DV3XHa zu-wZoSJDM{tk^5mbb0StvAuMXu2v)6iXErMrT>aru%fZFl&-gmnw%tr*3cV9&9;g* z)8qMe+nGgs=qy@Ay8=_Q_UJ`Rl2a3QC1c|Z&=T_G!i5WjW+xqA*rTLJe72#6(3iik z&`-_XKzhfFS?Dl2$HJSLFxtS(TId`x=f{?~W+sgGF|!srj4reAW+u!j$NJ=;XSVS| z(i4f(s7kG8*eA_F8e2s=H^m{vnm=}23CyF%G!t(o#;B=-NT1i7uwsYhdZwgbl3WSYUwoO6 
z5g#nB&&h%PAP$oQm!ZMl$YbseQgHa&9{KiQGMdNwV7%8Cp`I8=fy-nGGYI|`AOy}q zdkcWjWRkf9eB?as(#cc~@{7YeKveD&#$mj(Tm<9NMZP?iOt7OZr-uAunYV|dV{j}F zAyhw++?-pb*de^oj;WHyPBK)6wIrB7HzR5eSWbGM<)ByHN#yC>!bUS}b4 z)U6o*_D=2EnqjxEaq?#c(d#Gijkl3+t2N2k=xYqdDIfZ`-js$XtDa!@-Me+yJ{G5+ zy8#JJwC5gOU^&=1x|}vTrlrM>`FkxKSCq@S8UKgV-#D*GT3i(Lgy5Ev;_=xo$Iv1< z=66tzmAFgrL|JAgLB0IBBr_-{4DRZ`q?eP;oG?glQx5r*R*~PsExbsgI5Y97G#c_b^w32x%rmz!a8iwCA z0YJiaeKcd{j2Q=591e3YRjL?^?Hi5p7E9%7*yx%ny=OjTHo2^*iaoAo0(6!iT@0FZ z`0!!4vGLP^t|2Hro$B{MtenFyx=bUgN3s%8%H=&xrXK?*yjBguLX7R0D^h_Rux>WY zqm3)u{h&dz+HABJD~*PK&u18?zOCnh5zs*onQK`$0 zcK&6|Gy=w*%}$!cNep&G4p?WJE^Gf#dzw+fu)arDJbjXQp(M=^es<*r3My{F$u#cns&Hgx~E4mZ}_OjSl~qeW6tI zji2FeRjLRZxB`pGEj}L_KMu$8t(yh~-gWT92pWWL;h%JxC&pv>bHc}c;rJ?JZ?7La zmiLF0(;SpSS$Icu&i-)<_Oyiv{4|Z;2RKB8p$qgfU}c0@m-EbOWg&)&T>@;g5bH6A z05)8JRf@=FCWYyB%v2yM8WKRmG89mpUhD4NdnHNo0-4AaOmEAoP%la&WH<0!O8Jqs zLcgff^2L+v=dhMuZ&D7`?GA-Dk8dFw9HaE!?qo{IHT!VS{K4vpKNb`i@)|Q{9i`9e zh_IKk|5*tHR+^fSuu5d3`I2W8-0La@Fk@((7-f(N?zxx}6b1rXBEt{9un z9Kji6pP+FVmkym`t{e~$$#GWM*1F#R>o4AfzI zz?a9rYKHFsl6U~Zk^|>}Y-7gCzE`$vaaE8X8V{NK^%13+ubcIB+A|btY<-YY7c=6W zm6SdoB)qljIUplh@7eq)rZyiS2&X3r`i>2MO;AxhUm470@r_WrPU{t{MWP3D8$uJ+ z<0J#enj|LD{SaWwO#(9+0+_XpC_dTcj=JrqG`o>GGPIZl6u#&jP`hM?&_?LOL08;+ z3bYZ<6y`G;VF1KF7aq>85lv}k$QkwOdM0Hdd#-Kz$m_%9qyPMK`SK6N*THJ(zksW7 zFG`k+?-G;=C;?pb7{M0{2>^sD?8Ib@b>u>i2Md4%uDk+QxJO8PJ))JEj(5FcmFV2@ zj63CZ#!9p&c(%mL@S9Jta_4<9f4)@-hT0E03Ct^v903_}a>>4rNo09cYa7NX5#)r$ zy~#fmKbVHt!l<>in~(?#27{E7QBhbJwb-lL^`ym?6u-n6p<`@e6ci4Znb~9;s`Zcz zM^HGtV>cUCE!B#_E29%my~4M0V5z~UNl@1D*3l)FaNQJj9FNO}nx{4gpsVuZaO;y` z$?i+$|FQ&fZA_8egy4rfpr1(1h12m;th0IH5*Mo!&2qIN8~|)7C*}LSdZbd6N~eAQ zGUoitHl_C<Bm+ZeTfl=dD z6LL)uugQ>24mQZ09bpBFABv7n#^kgz0B&vOnnx5h}iwgI}npKewNr8lYb+|!^B#sGEDbt@wj8971-MBnecTO z%g(IoF1$pNeKZH(l|iVPZ7Gc@y}u9HYweZn>_1f()tc*o9puUG-Kmtjrls=FC;*eLEr}r@UtLuDs_&Flo{Q817om zb&mJZ%w69$d#fb6LGSSHlJlP8s=Logi{rEqqUYIij6Az&W>X^zF6std`OCCE*YF9E zr$0UO`Sa(STiKLe3jVzk@Hz7^NEEX~&6=fq)9d+ptorhO`(PG#=62_PTfbx&j4EaP 
z{D2VJlUEB)CoAU^Mz!Keysd9Mk4AzgRc;eav)5hquacjAO6(+$NAMP5PIU!&&;jxy zbf-cfFO)1-mj-dVET43jaDvQjGu+v`xcDx1GpG@vA5vlhC1-vhCq;U_2pyNd7qLxt zK-{L!$oqMMZWDXug3wHQ5P5u_KKc8HL+qMV@NaN3Wjx+cZB z7zM|s9u$-@7wd1rmI=$QB*|*Sl)DQHim^7C3te@y4KJVmo1~#ec@CYl)$yACyiBPJ z?GvuMr#UBo4GLIjw zSv*BG-jIyJzO{Sbyu=M8Xdod;(}eR*YJ``+|BEHyKRAX6UXEiaG&lkNHO8w1sRh82 z*wEG+@O8muI)?XvM)}0lns&wg$_#V)#D{__e}Ht$Q*~bWmFjji6o*g3dhad*1fgr` zFm{ttMTns{x)A)67(%Ez!NO*R#qaJUQmg|x5G1i#c~_h`mk1Sz!lNWwFe|@S1~pGl zm5l%fJxT8*QS)Am!z2*;j2!inVD%nMmDYh0mH*G;3gPB)o&3ekZNhLoiTSCnFaw)g z%tjcO(-3H*agwQ!yKoqO`kEFs-*JphBT)U}`S?|9plSnS!rUeS|A|Yn#4U?WQ~=gH zUtrZS1g2-o3qn7FpcVk0Lx=v!W3VRuxLlirfaJD>C?|BWuRZ|9A5B-u z9IgIC0PU01YLwUG%2-l<$}oyXzq1Am8PbJ1pFS)k-isGzZNupjEr;a2RK6wHXelWnk940A^Gp zl@c7cO*b}RA8UG*ZjxAY19%XMM8)!LBjmQFTi@)l!*MYDi&o(Wk2zAuSs;X(JXVSs ze)j9ewM5V>Q`seOJlt477nz9OR~=VAe$ym)N&cH95V$Ctt{2DcmwdMGWFqRYc#g}Q z>4)l&R|JZKlz#75u>#0HAq<|$z&irW{rXjZ&^SWp+4VjoO+fgOBgJ$BYO(f4z_fL- zcrsHHn2vNJEIdC(z87?Zn$ha0<$ZcfbTd`Yh=|Ebc79>9WRM>p-q0@kKpsP2p>lom zTs$!kv6_-;mbhl-Q4BvOrK7K9f2z|Qk>`A*wCW#BM9!sZAmW0sW>mm9QNNx`_eEgG z(XVBvk^d3NvN{xcO!u*{8;!*OZUoiddwNU@5peL`fNj;e`daxO`7{jC9+n4K;wz$)K> zb(+h{y9Ws{(JeJfVbw^ERrNCSXUgg2DOJgD=Cu+n~8$@tTkblW8j~25zS)>I z(@xW)@=e(PD?ME_bTbij4hb{iQ}YfX`^_U0nmvSAxg5xE@FuEq6f}uN6hUB>ZRgIN zJKRu1Yr(sqWC^qm!PBS18Y~D#Zy;YN;~(FpqTNDD}B|2UH442+JSFT)nvs}5tCCV*TS`KqwIzU_qM_gt7 zV_V$}LM}{=@g%X52|~4h$W-O9ztj)51yiFHY;wDHR&T*<;6`R%AD#&!0T0H|;FS01Z-)F|V8Spr2_8&-EDxgvLiitZdY1L_~z*+OTLAA566} zc+{vTiUmR20x|Lz+n7wq2~&zDGWWn#LOw7FOr#$eJ*K)VCaFRe*#q0q4&bGXck6$g zz}7J$2=!;VJSc+3+)slcfzpE3qhT1^ZS`1tkybZAQs&rhPbDg=20|JOM+-cj0Y=D{ z)vH%G&BHa19rXJiVw`x#nriP8Fqx`^JC9B!S}}xP>)#)Se#!WX4T?xcQh1G*{J$yz zxANu7>sQFsfAsCLXa8Y%4q>oU4VwQ@q{lRnpPdBn&Shwrc8q{W*SWO_>PMZSnoK)h zOF!NwFgF^LPVd^>`c~ywtacgnp1uE|=#NRIQROvD)??e}^iFH5NaNb1(EAdtHV%4Q zl2%)IH`h{CYvy2~`z0FJIhVnksDNmCpN%JrLKB*8sjJL!YNl;7vsSvaua$qv{q+*? 
zDb#zRD{vS7qwg=eDjhi7v{uJX;<$J3=lT61EQEdiEXm)z+|9+8`k4^B)1!c|f-|vA z=+)+J7YFH?95a*gxk`o(?cN<6WRj{Rs{+*!X7diJG=k81NBd!!xO!3^{JVI?J z0dYP3$A^T3d?0?*?6zJ?U|HJo-Oe3j#|qo(1?v6sc#xo=pus&0yZfe!yoMNZ10IK> zF;DvxwrR$Ya|jh=BRHD`SVCjPt#^DRz|_p#6=X(j{4W;suxX)+k3r4RS8#gn@4`@4 zQdkYU4tjMKOKY$IL3LrI2tsoKRtvI`4NsW}16OHRlP`o=PsgeJ9%#J?t9XwbhlH|7 ztUZ_YQprSAVJ0R^>{%I0$dg^L^49&uAznk;3i)TQ@!=)0U=*=OJ7@>tOdxgRFuZT;yRv@djzT1U}woEEU{ z)3TzYV`=Irn-+j0e}VVeL&c4w-PB9Tf3O6q4}#a&e-pXbUBi@#3GxUoG8cNplb^cx zW)99I4ngP-GR6yeew{)b!)@3}G9vf;ZT=K)y8R1z_JDOdx8}8}p;t~SwILa3hrpZg zEie!)3R)EwyYdYZUPz>dQ0ZkPOEv)dF~MIcUXfk3X13*H66OfS_mDGY-b|k-cLZ}s zEQVW8EO=Q&)vXhqJk)H(L{~~Ab41fP5T(FH5?;y_)kK!QmGrIXk%TDjKW%7Xiuv~@ zUw6VSqjztu=dxQ_74{7{wtNV>nXF)B2#>&$ZDkE0{0Ol14C}L4fVF0K?lgpk5$ET; z&DLyj8VH99^iBg51vmFp0k#CNmI#d!R+`);j_V-n z0!!`Mav0=}VLm@O9DO*5<=im`-gs3rWJMWP7ENpE_&JrDS9>?RWQ*J)e_3=WssIYsgUE&<07;h01)gQfl?vS3^=ZJl+DVUK%tum zKqw@7lhPSAM|aWPoW~{g7DC6JA>AZ;o1Di-MsE^nHaPAaSqg_NiXA}>0!=w+_ii){zAgtXh&9w_GbHD?!s0^mhi$Zg z<>i~8*V#DRK&u*hD+z?;R=rE`dxFDC9Ay|Z0x*u$kHb*$V;Z)FhhWSiI9bs!W1XX>NqI=1jBs5 zKXX_d4j*tqt`uh{W>Ab{VZQD`fz2y5RH5q^-ee%biVFf5*M(qS*?!1*(9o%lt%EFT z%qu&^`t+=HYg>^_-=ai#bnZ`d(T4?DPOY5Un@t^hmRwXr6ZS9fyV4~t%L zP-7RM)I1w*F_9D2I|set^u8Zz(;0%r9Q3AZH+x8Naxyd^hrci|pZA9EzMyOhi<7P` zeAoQ^1;nf89!25pYcBZSqO(U8DO9NRL&_BkM0~G@I{AD-fN6Ge!VWv_HZ{HBIoQkz z^M8}SwwlEk;g~zvDH}O`74}ftB{@eJOV1Jx!o#+Hu*}mstESs13J|eXW-43(pd*5{ z3=I8B4xyikvaZ!{S+H#ZVbK`n+gtTuzV1Hn`Ai_GkdcD$9UwyZ0hJZxG9(ITfAo7E z3y33I0XswiW368fx0OV8=%7i>;MYbPrgQ@GjDS<9)34T=DTkY5vpONT(0f8%HV;+yx?+p_BbXSe%MZuor}Y$dSMDY@ri%w|8i z;cHV+1-_>Vk=)rR-w6`zDbcG!gqyA?v2()Uo>?g|iGCvFyBx5}60BepNwhs*uDHbr z*&$Iw{~>I(EWH{;`nC{)DO1U+LU>CmN9?rPDhU$(fXov}UpUd|G&X8t3sq~@EQ3aq z^ELn#TAM*|!mFcUUs682wO@JTzWU_^zD`E(rNNhL7GEwhKl>4{1z&nTR~R*&9~LjT zw+UYEjZky=D4I$V`|wkk6$p3i^nl_>=2BngC>a;XtBu9ZFXJ|X+Z<< z0LR8Xt|kJNY|O=>B23A~Tzt`1CV@#bFg|C18PRpN^ykL=UFV6xXcIfP0aLWr;)3rj^&Jt(MkG@ChKz8Sx3v24d4)IMsVMcOX^h7;PJB%Rhr 
z*Y9Z-Y{yad6_t9NS0V;$+3ImB(`%tEx03q;lL&25a(PCNY#}2z7~EE}#$c?}8nW3c(6|v>W{?GvLaQ6MRQj}Q`v1tI&psF~ ze>7u;y+7*R5;om_5DPnGZQx6$Zvi8#|*lEB*oZhT#R0NNC%z(WxAf$G{_a1 zi}YKkxtEiE;gU_rd(i19|JA6b0;>T}X(@#2ryUX0ydTlqa1Q7bD)0xVs2>w5nl;&O zbP8?4gtZ0xtfPrpkQ}P8fex_im5RSIYSe6+CTvz(u#G;pR^@Kc2ES^&HUpebuKx# z6cC?vt;aNz`#iWFaVUKf_0JQ_HOuVr4w4ouJtyQP5VgbaRhMp~A6wE>kZc+hpWZ zn7rk`yv9znUsRH^r!RRBHBE0lOL#J<8T;B9bF$gba3*14fHDPKa@_N0HyPIU3% z!{N(v&zcyPtkh&!UpSKQcW6;w{{;)T<5ch>F)&3f35Bl9KFJ3gIAPbvWOUu0%9UxN zd>!wswK}Mqipij^JJoCSv96&I`Bx}c27aFHu5X19D)9`i%=5^o{%`XL@J+Vi5OLKa zi2U~y<$GiPO1gQM!7$>)Y*^Hs*FAL-FhF5x`G& zgyj;+LFRKkPEswQfN<;AH4O{J#-68H3qJQZBs$e3Qt7a8AKz5r*S1aV?dRwB+4D(y zbQ`hjnN6e%I^=|jW>{RBpX8vIz+>+7T}_SK65H%cQ~Ik?7nRvhFY%?8FWMRTVe{Z0 zQ|N`FdX6=sI{O4TY{T$JZB1f2R-XFo;jSKjg384d=^skg7_1cBGuu!KDnIrfTh$EV zNmWkB`HVf~pchFxRSXhR91;_&JlA=CSYkzO|JJggCq zpW==DE4UTdY}Km|i>TvWb^?F5VmrX#6c@(Z1Z=vu82ggaxHfv$R#G}fXAr!Xyxib^ zu%wp*R^y&Y&uerB*{!7w#>{$b`=f0}0%mMj2E6QTuD3w%>Gj$H0Rit^PbTT{O-*X3 z6DxZbpsbUIm=T~^xv&73&CRN$ZeT;`&kfu;wU?}{1ab|9Fl^{lGHfW-P0foU zIA&HAvxOZDQ9Uf?W!;-+8k}QV!Jn%EzE(i6$;Yk&?$u>2o$^);d3<>pPjrER#nv^a zhysrsabfeZV1bAlx5{07uz*F)U|_La7lX4#9FT14;+UgqNa z6X!j9rKEs`Phn#7dt7QCW4vAPqO5{Kxi)r1;}1}=Y#P>%m#RFwKC<)}wp@}PU>n62 z{~6LMYvDW0qENI9N~21@!Zr%(9v+39`|>lb-khrECVH$XeN&NgnD-2Nw&-wLQo*e2 z6FfK28x4!4L~)ZAO3e2uUEOsq-49Hyu3QJh4f^2_9xPoNk9+yL46F;xlSd z^5%dqkuqN(RYlXoq!}p`FY0orYTJfNEx!0b?MaDF*Y~IM&)VI)IoeB$_XB7AfRPpzwiol4^MxPLWR) z*Q4cl_VAs=7kSBO6eslc;IbE&t~ z9z0FW27j!$Dl~|fy-<9qY15*d&Y@nTQ%{<1UUrIkdc*G6u8?K&d;od{_8LlB_(do~ z|FkyO09_v(h`g2ol-Al>RNs?FWkB*SQ)_R9VCCmsXi-Xn!%lWs6JwvrA=`Zs^c~1i zh{J`J;E&gV$MVU09006$&jgm12m-V%K#$wlwLsznyQV4YsqARp0bnqGr5ymtbl3r; z!teV;*rX?9B%Ff?gS}vPhfq@KqtQOL<%Psn#ASDZi>{Wz9nVo!R@zmT@cCvJOHpi`U@G z2-9}(z-Iw6+1FK@?IKxkDAtsp?eJKJs~#5c@J`TH1A~{CcN$zzLM|HI73-IFXFD$$ z80dXuaA=lU&uTOG0uGBPt1Fhaq_+-E;ktg?t|8`~H2k*!7%|05QEe4RHk%g`47|nGU0bHEN*hQ8 zt5FYQ#n1xLx;=HRETkFFB?c^s+dFXJUdZPcc1&5XcP&)BX13DOr9JY7%GV^Wp@-}i zU6M2MwXd*-PNr6w;6@Ux6`?&O5IT%|*ht9KjO>*`C{(&)R;u#f 
z3QVGxyYqRQQB#q4!KT>`hZidL;ppVI^QA@>8w!oLPw$g!HvaUNp{Y2fZ&lr@HXH9|`8Z z2^wq(1|IQE!RU(;8@z{E616r-=EME0YdtojwG^xe9^`*f7-zMDy@ z<>Hu(HiA90HuFlvWsPZebIx(sqzR*A~EB z=Xe|TS~j^Hln;5HC-(cct>bcWP#B*@-nQwXK&4EeWf5RX>@k_-xWpxXKa<4DT9#l! z7THr%Bs((5s(fOscmj!+-ATvg?>Mx}A_pWE3&E^@h&VCmnv<;(HQ3*qutF0Cz^cL| zb$|!*2nZ;@WXY26i~?eV4J>S2xLzg51l`6qmxLXjXJ&?t8Do0b#!i?pfvK#RFrl)p z@siL=paLDLk+$OpxRXt$31%m2l}u+TxT#GwVB$75?oP=#a-tj8|+tfmBDb-QY!6kqX+WWI2t+RSO`7m z#-*aMd9O=WC_@B~<2#H;PE+{D9AW#q(}9w+Wn1D65o3jqNwHzDubhK~4HdZ>gzYy6 zKX>Zd%)%bS5EAae505p0J%$~Q@7gvFb~pxG!Jt?$cV`Y+{Fc+y5juk>9l%iCBzT+?`4YjNv9VlzNyBjZ1G4Zu+wau}$Wh;Tp zz$NidW9!^^opFm&wo@M$>k=*A$YbtDG_22Mm`$>Tcd^z7ABnksV)e&2HFx3=k8R9+ z-DSq#NvwKZQm$wzgpdQ4#{u3M{mg|0#bv*vMN^IiqSR4bU}j<6f*Pd=UaQ3o<6nvO zKCAUnir6=m4Z|n8LYTVt2-Z{9ELGl=4hw^DT&x3QZ%wElYj=Tw(9c-Ul@4jd6pX8D zG=Bttkr(Ma=;<_1%*JWFNay1o3&yEk=q_tv>N@`fqAeXFYjg|2Zo>9!s}@Q}O4mvH zYhGt4Z7nIf*P%ZxfzT{UxysXsd6Z_=GB^v>kkTg>frVzMGnmh(oU^FTT8(fOw10`p*zXtH5!fq2TVKZ%_EjgX%$shOBE^t`p9GGRZFcTJpy>`IgCSOg|! zC0%FYtYmDORP9q5XN_y(E|MQneu?LEku2|VR&st9pgnF#madVe9fNV!3uioO~Zm`pD2gPS+Dm|e3yJ&f}7yG@;^ z>}I=>4xq3Hz6GGUVut>9WsV_%(AkDeqq>ocmAbBXJfV-PXd?aS|CHr(U%4A3RF~;Q z^CTzzCJ(e^k(LgLiZ@@m0J7n|UH*`v4@tEBEL_%euP2lBgm|J>I;p_Rbbb`wB5TG` z;ajwZ9+4+X=kSE0-_z`cqZ#mxY^RZb%VWWsHEa4!?W#^$Th5@(GPbVoI!ppv#0ZsL zkcL?Ws{{^f;u2rm1i^#_%xDGcpJJYFocL~lUgfK}&v^nx7=%jOSCe`ELT2POSuv6M zW|H^ z?v5pQgSDvqvDf!Sc+pG#ua`i%H|Cr=(Yw6BK=sfBHZ4c9uPo&n-5SeadzhH@GaV=C z!>CU!s>?v4SRAj$6>^s1)j4 zZEDckB+b*UUI4O0oY*M_Ln;G!?`pAxNm%*0)lrHhJnU}ai42!Hl-hF@vO&BV5aMemsRHC>LpW}@}>|%(_!P#reY}nb4mqm7>k_m0i_qA zJVl`$oV%>Qe2Kzl5|R+(jOU8Vh&aOjg){^Fuc8{cRT+DD?AS*3O!@9ge+<`%()7eG zJh~Lb*ADI6xf?D$*W6Z3q|r@Ff@6?77(2HXv}nFT_2JuD{5v{LOg)3U^hb-{pQaaP zmRR?(D|6>rLp^R6D>j*K)*-}U5zpR|Vb3{39(@N0ITqXxEipK)8>ZkI2nn))Axv6evD~r;TY&`!TVLVgmt$_h82p< z9Ite&PNtc&QsN%VOYt-KaDMdu$>)~SL;J`sb~nFYw$yTxW`$76OXO&$1`Vbk!z7}k smI>`x{04g|6#jn@^yT~p%}bg;Z(b!W@_*UUD|aqf{p9ZL;f*~14?gE?L;wH) literal 0 HcmV?d00001 diff --git a/examples/MNIST/pnetcdf_io.py 
b/examples/MNIST/pnetcdf_io.py new file mode 100644 index 0000000..84adc86 --- /dev/null +++ b/examples/MNIST/pnetcdf_io.py @@ -0,0 +1,42 @@ +# +# Copyright (C) 2024, Northwestern University and Argonne National Laboratory +# See COPYRIGHT notice in top-level directory. +# + +# This is the I/O module for reading input samples using PnetCDF-Python + +from mpi4py import MPI +from pnetcdf import File + +class dataset(): + def __init__(self, path, samples, labels, transform=None, comm=None): + self.path = path + self.samples = samples + self.labels = labels + self.transform = transform + self.comm = comm + + # Open the NetCDF file + self.f = File(self.path, mode='r', comm=self.comm) + self.f.begin_indep() # To use independent I/O mode + + # Get dimensions of the variables + self.data_shape = self.f.variables[self.samples].shape + self.label_shape = self.f.variables[self.labels].shape + + def __len__(self): + return self.data_shape[0] + + def __getitem__(self, idx): + # Read the data and label at the given index + image = self.f.variables[self.samples][idx, ...] 
+ label = self.f.variables[self.labels][idx] + + if self.transform: + image = self.transform(image) + + return image, label + + def close(self): + self.f.close() + From 40282d2798479632e2470b2fc390ce1f00fede95 Mon Sep 17 00:00:00 2001 From: wkliao Date: Sat, 10 Aug 2024 18:12:01 -0500 Subject: [PATCH 09/20] github action: MNIST requires torchvision --- .github/workflows/pnetcdf_c_master.yml | 2 +- .github/workflows/pnetcdf_c_official.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pnetcdf_c_master.yml b/.github/workflows/pnetcdf_c_master.yml index ab940cc..f5393ee 100644 --- a/.github/workflows/pnetcdf_c_master.yml +++ b/.github/workflows/pnetcdf_c_master.yml @@ -83,7 +83,7 @@ jobs: pip install numpy cython cftime pytest twine wheel check-manifest export MPICC=$MPICH_DIR/bin/mpicc pip install mpi4py - pip install torch + pip install torch, torchvision - name: Install PnetCDF-Python run: | diff --git a/.github/workflows/pnetcdf_c_official.yml b/.github/workflows/pnetcdf_c_official.yml index 3946da3..4b349e1 100644 --- a/.github/workflows/pnetcdf_c_official.yml +++ b/.github/workflows/pnetcdf_c_official.yml @@ -81,7 +81,7 @@ jobs: pip install numpy cython cftime pytest twine wheel check-manifest export MPICC=$MPICH_DIR/bin/mpicc pip install mpi4py - pip install torch + pip install torch, torchvision - name: Install PnetCDF-Python run: | From e45466c495b9cfab36d03c1dcaceb9f60e05c2ba Mon Sep 17 00:00:00 2001 From: wkliao Date: Sat, 10 Aug 2024 18:12:38 -0500 Subject: [PATCH 10/20] add examples/MNIST into testing --- examples/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/Makefile b/examples/Makefile index 59f4ebc..2db3150 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -29,9 +29,11 @@ all: check: ptest4 cd Pytorch_DDP && make check + cd MNIST && make check ptests: ptest3 ptest4 ptest8 cd Pytorch_DDP && make ptests + cd MNIST && make ptests ptest3: @mkdir -p ${OUTPUT_DIR} From 
5da0d239b2d39b35813544b5e1bcf8e888f2a123 Mon Sep 17 00:00:00 2001 From: wkliao Date: Sat, 10 Aug 2024 18:26:27 -0500 Subject: [PATCH 11/20] github action: fix minor issue --- .github/workflows/pnetcdf_c_master.yml | 2 +- .github/workflows/pnetcdf_c_official.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pnetcdf_c_master.yml b/.github/workflows/pnetcdf_c_master.yml index f5393ee..b8e7503 100644 --- a/.github/workflows/pnetcdf_c_master.yml +++ b/.github/workflows/pnetcdf_c_master.yml @@ -83,7 +83,7 @@ jobs: pip install numpy cython cftime pytest twine wheel check-manifest export MPICC=$MPICH_DIR/bin/mpicc pip install mpi4py - pip install torch, torchvision + pip install torch torchvision - name: Install PnetCDF-Python run: | diff --git a/.github/workflows/pnetcdf_c_official.yml b/.github/workflows/pnetcdf_c_official.yml index 4b349e1..9b501a5 100644 --- a/.github/workflows/pnetcdf_c_official.yml +++ b/.github/workflows/pnetcdf_c_official.yml @@ -81,7 +81,7 @@ jobs: pip install numpy cython cftime pytest twine wheel check-manifest export MPICC=$MPICH_DIR/bin/mpicc pip install mpi4py - pip install torch, torchvision + pip install torch torchvision - name: Install PnetCDF-Python run: | From ee880fe937fbe0fae99708c453bfd27c59a33188 Mon Sep 17 00:00:00 2001 From: KWang1998 Date: Sun, 11 Aug 2024 12:43:21 -0500 Subject: [PATCH 12/20] add code for generating MNIST netcdf file --- examples/MNIST/create_mnist_netcdf.py | 108 ++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 examples/MNIST/create_mnist_netcdf.py diff --git a/examples/MNIST/create_mnist_netcdf.py b/examples/MNIST/create_mnist_netcdf.py new file mode 100644 index 0000000..07b2af9 --- /dev/null +++ b/examples/MNIST/create_mnist_netcdf.py @@ -0,0 +1,108 @@ +import os +import numpy as np +import numpy as np +import pnetcdf +from mpi4py import MPI +from array import array +import struct + +class MnistDataloader(object): + def __init__(self, 
training_images_filepath,training_labels_filepath, + test_images_filepath, test_labels_filepath): + self.training_images_filepath = training_images_filepath + self.training_labels_filepath = training_labels_filepath + self.test_images_filepath = test_images_filepath + self.test_labels_filepath = test_labels_filepath + + def read_images_labels(self, images_filepath, labels_filepath): + labels = [] + with open(labels_filepath, 'rb') as file: + magic, size = struct.unpack(">II", file.read(8)) + if magic != 2049: + raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic)) + labels = array("B", file.read()) + + with open(images_filepath, 'rb') as file: + magic, size, rows, cols = struct.unpack(">IIII", file.read(16)) + if magic != 2051: + raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic)) + image_data = array("B", file.read()) + images = [] + for i in range(size): + images.append([0] * rows * cols) + for i in range(size): + img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols]) + img = img.reshape(28, 28) + images[i][:] = img + + return images, labels + + def load_data(self): + x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath) + x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath) + return (x_train, y_train),(x_test, y_test) + +# +# Set file paths based on added MNIST Datasets +# +input_path = '.' 
+training_images_filepath = os.path.join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte') +training_labels_filepath = os.path.join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte') +test_images_filepath = os.path.join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte') +test_labels_filepath = os.path.join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte') + +# +# Load MINST dataset +# +mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath) +(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data() + +# use partial dataset +x_train_small = x_train[:60] +y_train_small = y_train[:60] +x_test_small = x_test[:12] +y_test_small = y_test[:12] + +def to_nc(train_samples, test_samples, train_labels, test_labels, comm, out_file_path='mnist_images.nc'): + if os.path.exists(out_file_path): + os.remove(out_file_path) + train_labels = list(train_labels) + test_labels = list(test_labels) + with pnetcdf.File(out_file_path, comm= comm, mode = "w", format = "64BIT_DATA") as fnc: + + dim_y = fnc.def_dim("Y", 28) + dim_x = fnc.def_dim("X", 28) + dim_num_train = fnc.def_dim("train_idx", len(train_samples)) + dim_num_test = fnc.def_dim("test_idx", len(test_samples)) + + # define nc variable for all imgs + v_train = fnc.def_var("train_images", pnetcdf.NC_UBYTE, (dim_num_train, dim_x, dim_y)) + # put labels into attributes + v_label_train = fnc.def_var("train_labels", pnetcdf.NC_UBYTE, (dim_num_train, )) + + # define nc variable for all imgs + v_test = fnc.def_var("test_images", pnetcdf.NC_UBYTE, (dim_num_test, dim_x, dim_y)) + # put labels into attributes + v_label_test = fnc.def_var("test_labels", pnetcdf.NC_UBYTE, (dim_num_test, )) + + # put values into each nc variable + fnc.enddef() + v_label_train[:] = np.array(train_labels, dtype = np.uint8) + for idx, img in enumerate(train_samples): + v_train[idx, :, :] = img + + v_label_test[:] = 
np.array(test_labels, dtype = np.uint8) + for idx, img in enumerate(test_samples): + v_test[idx, :, :] = img + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +size = comm.Get_size() + +# create mini MNIST file +to_nc(x_train_small, x_test_small, y_train_small, y_test_small, comm, "mnist_images_mini.nc") + +# create MNIST file +# to_nc(x_train, x_test, y_train, y_test, comm, "mnist_images.nc") + + From 68cd9ae19f0abd88c0b65d53802a6f54424f9f93 Mon Sep 17 00:00:00 2001 From: wkliao Date: Sun, 11 Aug 2024 17:05:38 -0500 Subject: [PATCH 13/20] revise netcdf file generation utility program --- examples/MNIST/create_mnist_netcdf.py | 176 ++++++++++++++++---------- 1 file changed, 111 insertions(+), 65 deletions(-) diff --git a/examples/MNIST/create_mnist_netcdf.py b/examples/MNIST/create_mnist_netcdf.py index 07b2af9..f788d7a 100644 --- a/examples/MNIST/create_mnist_netcdf.py +++ b/examples/MNIST/create_mnist_netcdf.py @@ -1,108 +1,154 @@ -import os -import numpy as np +# +# Copyright (C) 2024, Northwestern University and Argonne National Laboratory +# See COPYRIGHT notice in top-level directory. 
+# + +import os, argparse, struct import numpy as np -import pnetcdf -from mpi4py import MPI from array import array -import struct + +from mpi4py import MPI +import pnetcdf class MnistDataloader(object): def __init__(self, training_images_filepath,training_labels_filepath, test_images_filepath, test_labels_filepath): + self.training_images_filepath = training_images_filepath self.training_labels_filepath = training_labels_filepath self.test_images_filepath = test_images_filepath self.test_labels_filepath = test_labels_filepath - - def read_images_labels(self, images_filepath, labels_filepath): + + def read_images_labels(self, images_filepath, labels_filepath): labels = [] with open(labels_filepath, 'rb') as file: magic, size = struct.unpack(">II", file.read(8)) if magic != 2049: raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic)) - labels = array("B", file.read()) - + labels = array("B", file.read()) + with open(images_filepath, 'rb') as file: magic, size, rows, cols = struct.unpack(">IIII", file.read(16)) if magic != 2051: raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic)) - image_data = array("B", file.read()) + image_data = array("B", file.read()) images = [] for i in range(size): images.append([0] * rows * cols) for i in range(size): img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols]) img = img.reshape(28, 28) - images[i][:] = img - + images[i][:] = img + return images, labels - + def load_data(self): x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath) x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath) - return (x_train, y_train),(x_test, y_test) - -# -# Set file paths based on added MNIST Datasets -# -input_path = '.' 
-training_images_filepath = os.path.join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte') -training_labels_filepath = os.path.join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte') -test_images_filepath = os.path.join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte') -test_labels_filepath = os.path.join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte') - -# -# Load MINST dataset -# -mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath) -(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data() + return (x_train, y_train),(x_test, y_test) -# use partial dataset -x_train_small = x_train[:60] -y_train_small = y_train[:60] -x_test_small = x_test[:12] -y_test_small = y_test[:12] -def to_nc(train_samples, test_samples, train_labels, test_labels, comm, out_file_path='mnist_images.nc'): +def to_nc(train_samples, train_labels, test_samples, test_labels, out_file_path='mnist_images.nc'): if os.path.exists(out_file_path): os.remove(out_file_path) + train_labels = list(train_labels) test_labels = list(test_labels) - with pnetcdf.File(out_file_path, comm= comm, mode = "w", format = "64BIT_DATA") as fnc: - - dim_y = fnc.def_dim("Y", 28) - dim_x = fnc.def_dim("X", 28) - dim_num_train = fnc.def_dim("train_idx", len(train_samples)) - dim_num_test = fnc.def_dim("test_idx", len(test_samples)) - - # define nc variable for all imgs - v_train = fnc.def_var("train_images", pnetcdf.NC_UBYTE, (dim_num_train, dim_x, dim_y)) - # put labels into attributes - v_label_train = fnc.def_var("train_labels", pnetcdf.NC_UBYTE, (dim_num_train, )) - - # define nc variable for all imgs - v_test = fnc.def_var("test_images", pnetcdf.NC_UBYTE, (dim_num_test, dim_x, dim_y)) - # put labels into attributes - v_label_test = fnc.def_var("test_labels", pnetcdf.NC_UBYTE, (dim_num_test, )) - - # put values into each nc variable + + with pnetcdf.File(out_file_path, mode = "w", 
format = "NC_64BIT_DATA") as fnc: + + # Each image is of dimension 28 x 28 + dim_y = fnc.def_dim("height", 28) + dim_x = fnc.def_dim("width", 28) + + # define number of traing and testing samples + dim_train = fnc.def_dim("train_num", len(train_samples)) + dim_test = fnc.def_dim("test_num", len(test_samples)) + + # define nc variables to store training image samples and labels + train_data = fnc.def_var("train_samples", pnetcdf.NC_UBYTE, (dim_train, dim_y, dim_x)) + train_data.long_name = "training data samples" + train_label = fnc.def_var("train_labels", pnetcdf.NC_UBYTE, (dim_train)) + train_label.long_name = "labels of training samples" + + # define nc variables to store testing image samples and labels + test_data = fnc.def_var("test_samples", pnetcdf.NC_UBYTE, (dim_test, dim_y, dim_x)) + test_data.long_name = "testing data samples" + test_label = fnc.def_var("test_labels", pnetcdf.NC_UBYTE, (dim_test)) + test_label.long_name = "labels of testing samples" + + # exit define mode and enter data mode fnc.enddef() - v_label_train[:] = np.array(train_labels, dtype = np.uint8) + + # write training data samples for idx, img in enumerate(train_samples): - v_train[idx, :, :] = img - - v_label_test[:] = np.array(test_labels, dtype = np.uint8) + train_data[idx, :, :] = img + + # write labels of training data samples + train_label[:] = np.array(train_labels, dtype = np.uint8) + + # write testing data samples for idx, img in enumerate(test_samples): - v_test[idx, :, :] = img + test_data[idx, :, :] = img + + # write labels of testing data samples + test_label[:] = np.array(test_labels, dtype = np.uint8) + + +if __name__ == '__main__': + + # parse command-line arguments + args = None + parser = argparse.ArgumentParser(description='Store MNIST Datasets to a NetCDF file') + parser.add_argument("--verbose", help="Verbose mode", action="store_true") + parser.add_argument('--train-size', type=int, default=60, metavar='N', + help='Number of training samples extracted from the input 
file (default: 60)') + parser.add_argument('--test-size', type=int, default=12, metavar='N', + help='Number of testing samples extracted from the input file (default: 12)') + parser.add_argument("--train-data-file", nargs=1, type=str, help="(Optional) input file name of training data",\ + default = "train-images-idx3-ubyte") + parser.add_argument("--train-label-file", nargs=1, type=str, help="(Optional) input file name of training labels",\ + default = "train-labels-idx1-ubyte") + parser.add_argument("--test-data-file", nargs=1, type=str, help="(Optional) input file name of testing data",\ + default = "t10k-images-idx3-ubyte") + parser.add_argument("--test-label-file", nargs=1, type=str, help="(Optional) input file name of testing labels",\ + default = "t10k-labels-idx1-ubyte") + args = parser.parse_args() + + verbose = True if args.verbose else False + + if verbose: + print("Input file of training samples: ", args.train_data_file) + print("Input file of training labels: ", args.train_label_file) + print("Input file of testing samples: ", args.test_data_file) + print("Input file of testing labels: ", args.test_label_file) + + # + # Load MINST dataset + # + mnist_dataloader = MnistDataloader(args.train_data_file, + args.train_label_file, + args.test_data_file, + args.test_label_file) + + (train_data, train_label), (test_data, test_label) = mnist_dataloader.load_data() + + n_train = len(train_data) + if args.train_size > 0 and args.train_size < n_train: + n_train = int(args.train_size) -comm = MPI.COMM_WORLD -rank = comm.Get_rank() -size = comm.Get_size() + n_test = len(test_data) + if args.test_size > 0 and args.test_size < n_test: + n_test = int(args.test_size) -# create mini MNIST file -to_nc(x_train_small, x_test_small, y_train_small, y_test_small, comm, "mnist_images_mini.nc") + if verbose: + print("Number of training samples: ", n_train) + print("Number of testing samples: ", n_test) -# create MNIST file -# to_nc(x_train, x_test, y_train, y_test, comm, 
"mnist_images.nc") + # + # create mini MNIST file in NetCDF format + # + to_nc(train_data[0:n_train], train_label[0:n_train], + test_data[0:n_test], test_label[0:n_test], "mnist_images.nc") From baea23e21e0698883f7ea702158ee10208750df0 Mon Sep 17 00:00:00 2001 From: wkliao Date: Sun, 11 Aug 2024 17:06:42 -0500 Subject: [PATCH 14/20] revise README.md and Makefile --- examples/MNIST/Makefile | 37 +++++++++- examples/MNIST/README.md | 140 +++++++++++++++++++++++++++---------- examples/MNIST/mnist.patch | 30 ++++---- examples/Makefile | 2 + 4 files changed, 153 insertions(+), 56 deletions(-) diff --git a/examples/MNIST/Makefile b/examples/MNIST/Makefile index ef82d59..a80e506 100644 --- a/examples/MNIST/Makefile +++ b/examples/MNIST/Makefile @@ -8,8 +8,36 @@ check_PROGRAMS = mnist_main.py MNIST_URL = https://raw.githubusercontent.com/pytorch/examples/main/mnist/main.py mnist_main.py: - curl -Ls $(MNIST_URL) -o $@ - patch -st $@ < mnist.patch + @curl -Ls $(MNIST_URL) -o $@ + @patch -st $@ < mnist.patch + +MNIST_DATA_URL = https://yann.lecun.com/exdb/mnist + +MNIST_DATASETS = train-images-idx3-ubyte \ + train-labels-idx1-ubyte \ + t10k-images-idx3-ubyte \ + t10k-labels-idx1-ubyte + +MNIST_DATASETS_GZ = $(MNIST_DATASETS:=.gz) + +train-images-idx3-ubyte: + @curl -LOs $(MNIST_DATA_URL)/$@.gz + @gunzip $@.gz + +train-labels-idx1-ubyte: + @curl -LOs $(MNIST_DATA_URL)/$@.gz + @gunzip $@.gz + +t10k-images-idx3-ubyte: + @curl -LOs $(MNIST_DATA_URL)/$@.gz + @gunzip $@.gz + +t10k-labels-idx1-ubyte: + @curl -LOs $(MNIST_DATA_URL)/$@.gz + @gunzip $@.gz + +mnist_images.nc: $(MNIST_DATASETS) + @python create_mnist_netcdf.py all: @@ -21,5 +49,8 @@ ptests check: mnist_main.py mnist_images.nc @echo "" clean: - rm -rf mnist_main.py + rm -f mnist_main.py + rm -f $(MNIST_DATASETS) + rm -f $(MNIST_DATASETS_GZ) + rm -f mnist_images.nc diff --git a/examples/MNIST/README.md b/examples/MNIST/README.md index c7f8c11..5311e59 100644 --- a/examples/MNIST/README.md +++ b/examples/MNIST/README.md @@ 
-1,45 +1,109 @@ -# PnetCDF-python MNIST example +# MNIST example using PnetCDF-Python to Read Input Data -This directory contains the description and run instructions for the MNIST example Python programs that utilize PnetCDF for file I/O and parallel training with MNIST data. - -## Directory Structure - -- **MNIST_data**: This folder contains a mini MNIST test dataset stored in a NetCDF file (`mnist_images_mini.nc`). The file includes: - - 60 training samples - - 12 testing samples - -- **MNIST_codes**: This folder contains the example MNIST training code. The example code is based on the [PyTorch MNIST example](https://github.com/pytorch/examples/tree/main/mnist) and uses `DistributedDataParallel` for parallel training. +This directory contains files for running the Pytorch example program, +[MNIST](https://github.com/pytorch/examples/tree/main/mnist), +using Pytorch module `DistributedDataParallel` for parallel training and +`PnetCDF-Python` for reading data from a NetCDF files. +--- ## Running the MNIST Example Program -To run the MNIST example program, use the `mpiexec` command. The example below runs the program on 4 MPI processes. - -### Command: - -```sh -mpiexec -n 4 python main.py -``` - -### Expected Output: - -When using 4 MPI processes, the output is expected to be similar to the following: - -```sh -nprocs = 4 rank = 0 device = cpu mpi_size = 4 mpi_rank = 0 -nprocs = 4 rank = 2 device = cpu mpi_size = 4 mpi_rank = 2 -nprocs = 4 rank = 1 device = cpu mpi_size = 4 mpi_rank = 1 -nprocs = 4 rank = 3 device = cpu mpi_size = 4 mpi_rank = 3 - -Train Epoch: 1 Average Loss: 2.288340 -Test set: Average loss: 2.7425, Accuracy: 0/12 (0%) - -Train Epoch: 2 Average Loss: 2.490800 -Test set: Average loss: 1.9361, Accuracy: 6/12 (50%) - -Train Epoch: 3 Average Loss: 2.216520 -Test set: Average loss: 1.8703, Accuracy: 7/12 (58%) -``` - +* Firstly, run commands below to generate the python program file and NetCDF file. 
+ ```sh + make mnist_main.py` + make mnist_images.nc` + ``` +* Run command below to train the model using 4 MPI processes. + ```sh + mpiexec -n 4 python mnist_main.py --batch-size 4 --test-batch-size 2 --epochs 3 --input-file mnist_images.nc + ``` + +## Testing +* Command `make check` will do the following. + + Downloads the python source codes + [main.py](https://github.com/pytorch/examples/blob/main/mnist/main.py) + from [Pytorch Examples](https://github.com/pytorch/examples) as file + `mnist_main.py`. + + Applies patch file [mnist.patch](./mnist.patch) to `mnist_main.py`. + + Downloads the MNIST data sets from []() + + Run utility program [create_mnist_netcdf.py](./create_mnist_netcdf.py) + to extract a subset of images into a NetCDF file. + + Run the training program `mnist_main.py`. + +* Testing output shown on screen. + ``` + ===================================================================== + examples/MNIST: Parallel testing on 4 MPI processes + ====================================================================== + Train Epoch: 1 [0/60 (0%)] Loss: 2.514259 + Train Epoch: 1 [10/60 (67%)] Loss: 1.953820 + + Test set: Average loss: 2.2113, Accuracy: 4/12 (33%) + + Train Epoch: 2 [0/60 (0%)] Loss: 2.359334 + Train Epoch: 2 [10/60 (67%)] Loss: 2.092178 + + Test set: Average loss: 1.4825, Accuracy: 6/12 (50%) + + Train Epoch: 3 [0/60 (0%)] Loss: 2.067438 + Train Epoch: 3 [10/60 (67%)] Loss: 0.010670 + + Test set: Average loss: 1.2531, Accuracy: 7/12 (58%) + ``` + +## mnist_main.py command-line options + ``` + -h, --help show this help message and exit + --batch-size N input batch size for training (default: 64) + --test-batch-size N input batch size for testing (default: 1000) + --epochs N number of epochs to train (default: 14) + --lr LR learning rate (default: 1.0) + --gamma M Learning rate step gamma (default: 0.7) + --no-cuda disables CUDA training + --no-mps disables macOS GPU training + --dry-run quickly check a single pass + --seed S random seed 
(default: 1) + --log-interval N how many batches to wait before logging training status + --save-model For Saving the current Model + --input-file INPUT_FILE + NetCDF file storing train and test samples + ``` + +## create_mnist_netcdf.py command-line options + ``` + -h, --help show this help message and exit + --verbose Verbose mode + --train-size N Number of training samples extracted from the input file (default: 60) + --test-size N Number of testing samples extracted from the input file (default: 12) + --train-data-file TRAIN_DATA_FILE + (Optional) input file name of training data + --train-label-file TRAIN_LABEL_FILE + (Optional) input file name of training labels + --test-data-file TEST_DATA_FILE + (Optional) input file name of testing data + --test-label-file TEST_LABEL_FILE + (Optional) input file name of testing labels + ``` + +--- +## Files in this directory +* [mnist.patch](./mnist.patch) -- + a patch file to be applied on + [main.py](https://github.com/pytorch/examples/blob/main/mnist/main.py) + once downloaded from [Pytorch Examples](https://github.com/pytorch/examples) + before running the model training. + +* [comm_file.py](./comm_file.py) -- + implements the parallel environment for training the model in parallel. + +* [pnetcdf_io.py](./pnetcdf_io.py) -- + implements the file I/O using PnetCDF-Python. + +* [create_mnist_netcdf.py](./create_mnist_netcdf.py) -- + a utility python program that reads the MINST files, extract a subset of the + samples, and stores them into a newly created file in NetCDF format. + +--- ### Notes: - The test set accuracy may vary slightly depending on how the data is distributed across the MPI processes. - The accuracy and loss reported after each epoch are averaged across all MPI processes. 
diff --git a/examples/MNIST/mnist.patch b/examples/MNIST/mnist.patch index 1001ca2..757de77 100644 --- a/examples/MNIST/mnist.patch +++ b/examples/MNIST/mnist.patch @@ -1,5 +1,5 @@ --- mnist_main_original.py 2024-08-10 17:30:08.552324326 -0500 -+++ pnetcdf_mnist.py 2024-08-10 18:02:49.008705003 -0500 ++++ pnetcdf_mnist.py 2024-08-11 16:10:31.895471785 -0500 @@ -1,3 +1,8 @@ +# +# Copyright (C) 2024, Northwestern University and Argonne National Laboratory @@ -15,10 +15,10 @@ from torch.optim.lr_scheduler import StepLR +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler - + +import comm_file, pnetcdf_io +from mpi4py import MPI - + class Net(nn.Module): def __init__(self): @@ -42,14 +51,13 @@ @@ -32,7 +32,7 @@ 100. * batch_idx / len(train_loader), loss.item())) if args.dry_run: break - + - def test(model, device, test_loader): model.eval() @@ -40,19 +40,19 @@ @@ -62,9 +70,14 @@ pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() - + + # aggregate loss among all ranks + test_loss = comm.mpi_comm.allreduce(test_loss, op=MPI.SUM) + correct = comm.mpi_comm.allreduce(correct, op=MPI.SUM) + test_loss /= len(test_loader.dataset) - + - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + if rank == 0: + print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( test_loss, correct, len(test_loader.dataset), 100. 
* correct / len(test_loader.dataset))) - + @@ -94,6 +107,8 @@ help='how many batches to wait before logging training status') parser.add_argument('--save-model', action='store_true', default=False, @@ -65,7 +65,7 @@ @@ -107,7 +122,7 @@ else: device = torch.device("cpu") - + - train_kwargs = {'batch_size': args.batch_size} + train_kwargs = {'batch_size': args.batch_size//nprocs} test_kwargs = {'batch_size': args.test_batch_size} @@ -84,8 +84,8 @@ + + # Open files storing training and testing samples + infile = args.input_file -+ train_file = pnetcdf_io.dataset(infile, 'train_images', 'train_labels', transform, comm.mpi_comm) -+ test_file = pnetcdf_io.dataset(infile, 'test_images', 'test_labels', transform, comm.mpi_comm) ++ train_file = pnetcdf_io.dataset(infile, 'train_samples', 'train_labels', transform, comm.mpi_comm) ++ test_file = pnetcdf_io.dataset(infile, 'test_samples', 'test_labels', transform, comm.mpi_comm) + + # create distributed samplers + train_sampler = DistributedSampler(train_file, num_replicas=nprocs, rank=rank, shuffle=True) @@ -94,14 +94,14 @@ + # add distributed samplers to DataLoaders + train_loader = torch.utils.data.DataLoader(train_file, sampler=train_sampler, **train_kwargs) + test_loader = torch.utils.data.DataLoader(test_file, sampler=test_sampler, **test_kwargs, drop_last=False) - + model = Net().to(device) + + # use DDP + model = DDP(model, device_ids=[device] if use_cuda else None) + optimizer = optim.Adadelta(model.parameters(), lr=args.lr) - + scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) for epoch in range(1, args.epochs + 1): + # train sampler set epoch @@ -111,16 +111,16 @@ train(args, model, device, train_loader, optimizer, epoch) test(model, device, test_loader) scheduler.step() - + if args.save_model: - torch.save(model.state_dict(), "mnist_cnn.pt") + if rank == 0: + torch.save(model.state_dict(), "mnist_cnn.pt") - + + # close files + train_file.close() + test_file.close() - + if __name__ == '__main__': + ## 
initialize parallel environment + comm, device = comm_file.init_parallel() diff --git a/examples/Makefile b/examples/Makefile index 2db3150..d91cff9 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -61,4 +61,6 @@ ptest8: clean: rm -rf ${OUTPUT_DIR} + cd Pytorch_DDP && make clean + cd MNIST && make clean From 81b4dfae7673981d2382e925efd5d3fa8f91693f Mon Sep 17 00:00:00 2001 From: Kewei Wang Date: Sun, 11 Aug 2024 19:04:24 -0500 Subject: [PATCH 15/20] set device and remove shuffle for training with CUDA --- examples/MNIST/mnist.patch | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/MNIST/mnist.patch b/examples/MNIST/mnist.patch index 757de77..c3d2d22 100644 --- a/examples/MNIST/mnist.patch +++ b/examples/MNIST/mnist.patch @@ -62,7 +62,15 @@ args = parser.parse_args() use_cuda = not args.no_cuda and torch.cuda.is_available() use_mps = not args.no_mps and torch.backends.mps.is_available() -@@ -107,7 +122,7 @@ +@@ -101,18 +116,18 @@ + torch.manual_seed(args.seed) + + if use_cuda: +- device = torch.device("cuda") ++ torch.cuda.set_device(rank) # Set the GPU device by rank ++ device = torch.device(f"cuda:{rank}") + elif use_mps: + device = torch.device("mps") else: device = torch.device("cpu") @@ -71,6 +79,12 @@ test_kwargs = {'batch_size': args.test_batch_size} if use_cuda: cuda_kwargs = {'num_workers': 1, +- 'pin_memory': True, +- 'shuffle': True} ++ 'pin_memory': True} + train_kwargs.update(cuda_kwargs) + test_kwargs.update(cuda_kwargs) + @@ -120,25 +135,53 @@ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) From f4682e80899b55ad264f217f636d92733291b372 Mon Sep 17 00:00:00 2001 From: wkliao Date: Sun, 11 Aug 2024 19:03:08 -0500 Subject: [PATCH 16/20] add the MNIST dataset URL as a global attribute --- examples/MNIST/create_mnist_netcdf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/MNIST/create_mnist_netcdf.py b/examples/MNIST/create_mnist_netcdf.py index f788d7a..0fc8504 100644 --- 
a/examples/MNIST/create_mnist_netcdf.py +++ b/examples/MNIST/create_mnist_netcdf.py @@ -57,6 +57,9 @@ def to_nc(train_samples, train_labels, test_samples, test_labels, out_file_path= with pnetcdf.File(out_file_path, mode = "w", format = "NC_64BIT_DATA") as fnc: + # add MNIST dataset URL as a global attribute + fnc.url = "https://yann.lecun.com/exdb/mnist/" + # Each image is of dimension 28 x 28 dim_y = fnc.def_dim("height", 28) dim_x = fnc.def_dim("width", 28) From 679ee1ae783962ea46b7f41df4d04f2411d3f45c Mon Sep 17 00:00:00 2001 From: wkliao Date: Sun, 11 Aug 2024 19:56:59 -0500 Subject: [PATCH 17/20] add command-line option --out-file --- examples/MNIST/create_mnist_netcdf.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/examples/MNIST/create_mnist_netcdf.py b/examples/MNIST/create_mnist_netcdf.py index 0fc8504..ba79e45 100644 --- a/examples/MNIST/create_mnist_netcdf.py +++ b/examples/MNIST/create_mnist_netcdf.py @@ -48,14 +48,9 @@ def load_data(self): return (x_train, y_train),(x_test, y_test) -def to_nc(train_samples, train_labels, test_samples, test_labels, out_file_path='mnist_images.nc'): - if os.path.exists(out_file_path): - os.remove(out_file_path) +def to_nc(train_samples, train_labels, test_samples, test_labels, out_file): - train_labels = list(train_labels) - test_labels = list(test_labels) - - with pnetcdf.File(out_file_path, mode = "w", format = "NC_64BIT_DATA") as fnc: + with pnetcdf.File(out_file, mode = "w", format = "NC_64BIT_DATA") as fnc: # add MNIST dataset URL as a global attribute fnc.url = "https://yann.lecun.com/exdb/mnist/" @@ -116,6 +111,8 @@ def to_nc(train_samples, train_labels, test_samples, test_labels, out_file_path= default = "t10k-images-idx3-ubyte") parser.add_argument("--test-label-file", nargs=1, type=str, help="(Optional) input file name of testing labels",\ default = "t10k-labels-idx1-ubyte") + parser.add_argument("--out-file", nargs=1, type=str, help="(Optional) output NetCDF file name",\ +
default = "mnist_images.nc") args = parser.parse_args() verbose = True if args.verbose else False @@ -152,6 +149,6 @@ def to_nc(train_samples, train_labels, test_samples, test_labels, out_file_path= # create mini MNIST file in NetCDF format # to_nc(train_data[0:n_train], train_label[0:n_train], - test_data[0:n_test], test_label[0:n_test], "mnist_images.nc") + test_data[0:n_test], test_label[0:n_test], args.out_file) From 42073d7d4cdb855aa23d9a4646ce3f74c308781b Mon Sep 17 00:00:00 2001 From: wkliao Date: Sun, 11 Aug 2024 17:04:56 -0500 Subject: [PATCH 18/20] Use official MNIST dataset URL * re-generating mnist_images.nc is optional, i.e. by command 'make mnist_images.nc' --- examples/MNIST/Makefile | 17 +++---- examples/MNIST/README.md | 85 ++++++++++++++++++++++----------- examples/MNIST/mnist_images.nc | Bin 57032 -> 57544 bytes 3 files changed, 65 insertions(+), 37 deletions(-) diff --git a/examples/MNIST/Makefile b/examples/MNIST/Makefile index a80e506..83bbac0 100644 --- a/examples/MNIST/Makefile +++ b/examples/MNIST/Makefile @@ -7,6 +7,8 @@ check_PROGRAMS = mnist_main.py MNIST_URL = https://raw.githubusercontent.com/pytorch/examples/main/mnist/main.py +all: + mnist_main.py: @curl -Ls $(MNIST_URL) -o $@ @patch -st $@ < mnist.patch @@ -21,27 +23,25 @@ MNIST_DATASETS = train-images-idx3-ubyte \ MNIST_DATASETS_GZ = $(MNIST_DATASETS:=.gz) train-images-idx3-ubyte: - @curl -LOs $(MNIST_DATA_URL)/$@.gz + @curl -LOsf $(MNIST_DATA_URL)/$@.gz @gunzip $@.gz train-labels-idx1-ubyte: - @curl -LOs $(MNIST_DATA_URL)/$@.gz + @curl -LOsf $(MNIST_DATA_URL)/$@.gz @gunzip $@.gz t10k-images-idx3-ubyte: - @curl -LOs $(MNIST_DATA_URL)/$@.gz + @curl -LOsf $(MNIST_DATA_URL)/$@.gz @gunzip $@.gz t10k-labels-idx1-ubyte: - @curl -LOs $(MNIST_DATA_URL)/$@.gz + @curl -LOsf $(MNIST_DATA_URL)/$@.gz @gunzip $@.gz mnist_images.nc: $(MNIST_DATASETS) @python create_mnist_netcdf.py -all: - -ptests check: mnist_main.py mnist_images.nc +ptests check: mnist_main.py @echo 
"======================================================================" @echo " examples/MNIST: Parallel testing on 4 MPI processes" @echo "======================================================================" @@ -52,5 +52,6 @@ clean: rm -f mnist_main.py rm -f $(MNIST_DATASETS) rm -f $(MNIST_DATASETS_GZ) - rm -f mnist_images.nc + +.PHONY: all check ptests clean diff --git a/examples/MNIST/README.md b/examples/MNIST/README.md index 5311e59..ba670ac 100644 --- a/examples/MNIST/README.md +++ b/examples/MNIST/README.md @@ -5,19 +5,35 @@ This directory contains files for running the Pytorch example program, using Pytorch module `DistributedDataParallel` for parallel training and `PnetCDF-Python` for reading data from a NetCDF files. ---- ## Running the MNIST Example Program -* Firstly, run commands below to generate the python program file and NetCDF file. +* Firstly, run command below to generate the python program file. ```sh - make mnist_main.py` - make mnist_images.nc` + make mnist_main.py ``` * Run command below to train the model using 4 MPI processes. ```sh mpiexec -n 4 python mnist_main.py --batch-size 4 --test-batch-size 2 --epochs 3 --input-file mnist_images.nc ``` +* `mnist_main.py` command-line options + ``` + -h, --help show this help message and exit + --batch-size N input batch size for training (default: 64) + --test-batch-size N input batch size for testing (default: 1000) + --epochs N number of epochs to train (default: 14) + --lr LR learning rate (default: 1.0) + --gamma M Learning rate step gamma (default: 0.7) + --no-cuda disables CUDA training + --no-mps disables macOS GPU training + --dry-run quickly check a single pass + --seed S random seed (default: 1) + --log-interval N how many batches to wait before logging training status + --save-model For Saving the current Model + --input-file INPUT_FILE + NetCDF file storing train and test samples + ``` + ## Testing * Command `make check` will do the following. 
+ Downloads the python source codes @@ -25,10 +41,7 @@ using Pytorch module `DistributedDataParallel` for parallel training and from [Pytorch Examples](https://github.com/pytorch/examples) as file `mnist_main.py`. + Applies patch file [mnist.patch](./mnist.patch) to `mnist_main.py`. - + Downloads the MNIST data sets from []() - + Run utility program [create_mnist_netcdf.py](./create_mnist_netcdf.py) - to extract a subset of images into a NetCDF file. - + Run the training program `mnist_main.py`. + + Run the training program `mnist_main.py` in parallel using 4 MPI processes. * Testing output shown on screen. ``` @@ -51,25 +64,15 @@ using Pytorch module `DistributedDataParallel` for parallel training and Test set: Average loss: 1.2531, Accuracy: 7/12 (58%) ``` -## mnist_main.py command-line options - ``` - -h, --help show this help message and exit - --batch-size N input batch size for training (default: 64) - --test-batch-size N input batch size for testing (default: 1000) - --epochs N number of epochs to train (default: 14) - --lr LR learning rate (default: 1.0) - --gamma M Learning rate step gamma (default: 0.7) - --no-cuda disables CUDA training - --no-mps disables macOS GPU training - --dry-run quickly check a single pass - --seed S random seed (default: 1) - --log-interval N how many batches to wait before logging training status - --save-model For Saving the current Model - --input-file INPUT_FILE - NetCDF file storing train and test samples - ``` - -## create_mnist_netcdf.py command-line options +## Generate the Input NetCDF File From MNIST Datasets +* Utility program [create_mnist_netcdf.py](./create_mnist_netcdf.py) + can be used to extract a subset of images into a NetCDF file. +* Command `make mnist_images.nc` will first download the MNIST data files from + https://yann.lecun.com/exdb/mnist and extract 60 images as training samples + and 12 images as testing samples into a new file named `mnist_images.nc`. 
+* `create_mnist_netcdf.py` can also run individually to extract a different + number of images using command-line options shown below. +* `create_mnist_netcdf.py` command-line options: ``` -h, --help show this help message and exit --verbose Verbose mode @@ -83,9 +86,34 @@ using Pytorch module `DistributedDataParallel` for parallel training and (Optional) input file name of testing data --test-label-file TEST_LABEL_FILE (Optional) input file name of testing labels + --out-file OUT_FILE (Optional) output NetCDF file name + ``` +* The NetCDF file metadata can be obtained by running command "ncmpidump -h" or + "ncdump -h". + ```sh + % ncmpidump -h mnist_images.nc + netcdf mnist_images { + // file format: CDF-5 (big variables) + dimensions: + height = 28 ; + width = 28 ; + train_num = 60 ; + test_num = 12 ; + variables: + ubyte train_samples(train_num, height, width) ; + train_samples:long_name = "training data samples" ; + ubyte train_labels(train_num) ; + train_labels:long_name = "labels of training samples" ; + ubyte test_samples(test_num, height, width) ; + test_samples:long_name = "testing data samples" ; + ubyte test_labels(test_num) ; + test_labels:long_name = "labels of testing samples" ; + + // global attributes: + :url = "https://yann.lecun.com/exdb/mnist/" ; + } ``` ---- ## Files in this directory * [mnist.patch](./mnist.patch) -- a patch file to be applied on @@ -103,7 +131,6 @@ using Pytorch module `DistributedDataParallel` for parallel training and a utility python program that reads the MINST files, extract a subset of the samples, and stores them into a newly created file in NetCDF format. ---- ### Notes: - The test set accuracy may vary slightly depending on how the data is distributed across the MPI processes. - The accuracy and loss reported after each epoch are averaged across all MPI processes. 
diff --git a/examples/MNIST/mnist_images.nc b/examples/MNIST/mnist_images.nc index 4cf0aa40dad3bf85817df9bff4a4009104b7b253..85bfc49f17cf08a3cee2c52f13b6dd7dd8181e05 100644 GIT binary patch delta 635 zcmX@Hm-)m&<_YpvY#FJU=@}&qFdzeEu$E`0lw`m}85m&VoFzqxnR)SfrMVLWCF+3` zR0$7|24N6jgis93rA0X)CKHselu=SrP;8~IUzwPfr>7mrU17A*>I@Y3^ow6cSG%9 z;3-KhE{R803G^I94Tw&JG9eTx9u)x@jK!Z&hp0duuzPYjqedjLzLZ4rCDeEnr^9>+ XHSfWm&53D^jFUfb>2IFFWN-%n7B_9J delta 279 zcmX?ckom-3<_Yp*jFAj*AOmGFMohGo%*srufXgu0K$#pRsl_Gn2q_+f5-^J!%4dPn zJS9bmnR)SnG>XY8P@Ov`r!#4=L2Q|Pkx7*4-X5R>?rfgHVsHll-fbw> From 98192266cdca54223f36d17c14350c97fa9b172e Mon Sep 17 00:00:00 2001 From: wkliao Date: Sun, 11 Aug 2024 20:00:59 -0500 Subject: [PATCH 19/20] indicate makefile phony targets --- Makefile | 4 ++++ examples/Makefile | 2 ++ examples/Pytorch_DDP/Makefile | 2 ++ test/Makefile | 2 ++ 4 files changed, 10 insertions(+) diff --git a/Makefile b/Makefile index d0a077d..cc02b8e 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,8 @@ # See COPYRIGHT notice in top-level directory. 
# +all: + check: cd test && make check cd examples && make check @@ -32,3 +34,5 @@ build-clean: clean install-clean: build-clean rm -rf dist +.PHONY: all check ptests clean build-clean install-clean + diff --git a/examples/Makefile b/examples/Makefile index d91cff9..45b0d32 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -64,3 +64,5 @@ clean: cd Pytorch_DDP && make clean cd MNIST && make clean +.PHONY: all check ptests ptest3 ptest4 ptest8 clean + diff --git a/examples/Pytorch_DDP/Makefile b/examples/Pytorch_DDP/Makefile index c48a853..3a5082b 100644 --- a/examples/Pytorch_DDP/Makefile +++ b/examples/Pytorch_DDP/Makefile @@ -43,3 +43,5 @@ ptest8: clean: rm -rf ${OUTPUT_DIR} +.PHONY: all check ptests ptest3 ptest4 ptest8 clean + diff --git a/test/Makefile b/test/Makefile index 0a90a23..86bee08 100644 --- a/test/Makefile +++ b/test/Makefile @@ -83,3 +83,5 @@ ptest8: clean: rm -rf ${OUTPUT_DIR} +.PHONY: all check ptests ptest3 ptest4 ptest8 clean + From d24be5e7ac4e96ac7b75f984ddb956819a5fc391 Mon Sep 17 00:00:00 2001 From: wkliao Date: Mon, 12 Aug 2024 11:05:31 -0500 Subject: [PATCH 20/20] mention MNIST to examples/README.md --- examples/README.md | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/examples/README.md b/examples/README.md index 6d77fe8..54ecb2f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -19,13 +19,19 @@ be found at the beginning of each file. --- ### Overview of Test Programs +* [MNIST](./MNIST) + + This directory contains an example of + [MNIST](https://github.com/pytorch/examples/tree/main/mnist), + using Pytorch module `DistributedDataParallel` for parallel training and + `PnetCDF-Python` for reading data from NetCDF files. + * [Pytorch_DDP](./Pytorch_DDP) - + A directory containing examples that make use of Pytorch Distributed Data + + A directory contains examples that make use of Pytorch Distributed Data Parallel module to run python programs in parallel.
* [collective_write.py](./collective_write.py) - + writes multiple 3D subarrays to non-record variables of int type using - collective I/O mode. + + This example writes multiple 3D subarrays to non-record variables of int + type using collective I/O mode. * [put_vara.py](./put_vara.py) + This example shows how to use `Variable` method put_var() to write a 2D @@ -33,23 +39,23 @@ be found at the beginning of each file. partitioning across all processes. * [get_vara.py](./get_vara.py) - + This is the read counterpart of [put_vara.py](./put_vara.py), which shows - how to use to `Variable` method get_var() read a 2D 4-byte integer array in - parallel. + + This example is the read counterpart of [put_vara.py](./put_vara.py), which + shows how to use `Variable` method get_var() to read a 2D 4-byte integer + array in parallel. * [nonblocking_write.py](./nonblocking_write.py) - + Similar to `collective_write.py`, it uses nonblocking APIs instead. It - creates a netcdf file in CDF-5 format and writes a number of 3D integer - non-record variables. + + Similar to `collective_write.py`, this example uses nonblocking APIs + instead. It creates a netcdf file in CDF-5 format and writes a number of 3D + integer non-record variables. * [nonblocking_write_def.py](./nonblocking_write_def.py) - + This is the same as `nonblocking_write.py` expect all nonblocking write - requests (calls to `iput` and `bput`) are posted in define mode. It creates - a netcdf file in CDF-5 format and writes a number of 3D integer non-record - variables. + + This example is the same as `nonblocking_write.py` except all nonblocking + write requests (calls to `iput` and `bput`) are posted in define mode. It + creates a netcdf file in CDF-5 format and writes a number of 3D integer + non-record variables.
* [create_open.py](./create_open.py) - + This example shows how to use `File` class constructor to create a netCDF + + This example shows how to use `File` class constructor to create a NetCDF file and to open the file for read only. * [ghost_cell.py](./ghost_cell.py)