Skip to content

Running Legion on Frontier

Seshu Yamajala edited this page Oct 5, 2023 · 8 revisions

S3D has been tested on Frontier. Other applications still need to be tested. Here are the settings/variables used for S3D.

Environment

The following environment has been verified to work on Frontier:

module load cray-python
module swap $LMOD_FAMILY_PRGENV PrgEnv-amd
module load cray-pmi
module load amd/5.1.0
module unload darshan-runtime
export CC=cc
export CXX=CC
export HOST_CC=gcc
export HOST_CXX=g++

The amd/5.4.0 module suffers from HSA_OUT_OF_RESOURCES errors, so amd/5.1.0 is used instead. Be sure to unload darshan-runtime after loading amd/5.1.0.

GASNet

Use GASNet 2023.3.0 with the ofi-slingshot11 conduit. The configuration in the legion gasnet repo has been verified to work: https://github.com/StanfordLegion/gasnet/blob/master/configs/config.ofi-slingshot11.release

Legion

For S3D the following commit works for scaling on Frontier up to 2048 nodes:

commit 3b0886d24d30273f4afa036fdd11acafe4c812d1
Author: Mike <mebauer@cs.stanford.edu>
Date:   Thu Aug 25 01:43:52 2022 -0700

    legion: small fix for latent field space deletions

Later commits of Legion will work but suffer from out-of-memory issues at runtime.

The following commit for realm flow control must be cherry-picked on top of this commit:

git cherry-pick 292d5d7c

https://gitlab.com/StanfordLegion/legion/-/commit/292d5d7cf2723ac7b5451c84198c576e72215d38

Pass the following arguments when building Legion with CMake: -DLegion_NETWORKS=gasnetex -DLegion_MAX_NUM_NODES=65536

The following patches can also be useful for scaling.

Allocate IDs for 8192 nodes/64k ranks:

diff --git a/runtime/realm/id.h b/runtime/realm/id.h
index 42b6fcd5f..219054c5f 100644
--- a/runtime/realm/id.h
+++ b/runtime/realm/id.h
@@ -46,11 +46,11 @@ namespace Realm {
       // COMPQUEUE:   tag:8 = 0x19, owner_node:16,   (unused):28, cq_idx: 12
       // SUBGRAPH:    tag:8 = 0x18, creator_node:16, (unused):16, subgraph_idx: 24
 
-      static const int NODE_FIELD_WIDTH = 16;
+      static const int NODE_FIELD_WIDTH = 17;
       static const unsigned MAX_NODE_ID = (1U << NODE_FIELD_WIDTH) - 2; // reserve all 1's for special cases
       static const int EVENT_GENERATION_WIDTH = REALM_EVENT_GENERATION_BITS; // fom realm_c.h
       static const int MEMORY_INDEX_WIDTH = 8;
-      static const int INSTANCE_INDEX_WIDTH = 22;
+      static const int INSTANCE_INDEX_WIDTH = 20;
 
 #define ACCESSOR(structname, name, field) \
       bitpack<IDType>::bitsliceref<structname::field> name ## _ ## field() { return id.slice<structname::field>(); } \
@@ -77,7 +77,7 @@ namespace Realm {
        typedef bitfield<4, 60> type_tag;
        typedef bitfield<NODE_FIELD_WIDTH,
                         60-NODE_FIELD_WIDTH> creator_node;
-       typedef bitfield<24,
+       typedef bitfield<40 - NODE_FIELD_WIDTH,
                         EVENT_GENERATION_WIDTH> barrier_idx;
        typedef bitfield<EVENT_GENERATION_WIDTH, 0> generation;  // MUST MATCH FMT_Event::generation size
 
@@ -160,7 +160,7 @@ namespace Realm {
                         56-NODE_FIELD_WIDTH> owner_node;
        typedef bitfield<NODE_FIELD_WIDTH,
                         56-2*NODE_FIELD_WIDTH> creator_node;
-       typedef bitfield<24, 0> pgroup_idx;
+       typedef bitfield<56-2*NODE_FIELD_WIDTH, 0> pgroup_idx;
 
        static const IDType TAG_VALUE = 0x1c;
       };
@@ -176,7 +176,7 @@ namespace Realm {
                         60-NODE_FIELD_WIDTH> owner_node;
        typedef bitfield<NODE_FIELD_WIDTH,
                         60-2*NODE_FIELD_WIDTH> creator_node;
-       typedef bitfield<28, 0> sparsity_idx;
+       typedef bitfield<60-2*NODE_FIELD_WIDTH, 0> sparsity_idx;
 
        static const IDType TAG_VALUE = 0x3;
       };
@@ -206,7 +206,7 @@ namespace Realm {
                         56-NODE_FIELD_WIDTH> owner_node;
        typedef bitfield<NODE_FIELD_WIDTH,
                         56-2*NODE_FIELD_WIDTH> creator_node;
-       typedef bitfield<24, 0> subgraph_idx;
+       typedef bitfield<56-2*NODE_FIELD_WIDTH, 0> subgraph_idx;
 
        static const IDType TAG_VALUE = 0x18;
       };

Reduce runtime memory usage:

diff --git a/runtime/realm/runtime_impl.h b/runtime/realm/runtime_impl.h
index d739ad8f4..068ecd240 100644
--- a/runtime/realm/runtime_impl.h
+++ b/runtime/realm/runtime_impl.h
@@ -117,7 +117,7 @@ namespace Realm {
     // use a wide tree for local events - max depth will be 2
     typedef DynamicTableAllocator<GenEventImpl, 11, 16> LocalEventTableAllocator;
     // use a narrow tree for remote events - depth is 3, leaves have 128 events
-    typedef DynamicTableAllocator<GenEventImpl, 10, 7> RemoteEventTableAllocator;
+    typedef DynamicTableAllocator<GenEventImpl, 11, 5> RemoteEventTableAllocator;
     typedef DynamicTableAllocator<BarrierImpl, 10, 4> BarrierTableAllocator;
     typedef DynamicTableAllocator<ReservationImpl, 10, 8> ReservationTableAllocator;
     typedef DynamicTableAllocator<ProcessorGroupImpl, 10, 4> ProcessorGroupTableAllocator;

Turn off complex number support for Regent:

diff --git a/language/src/regent/std_base.t b/language/src/regent/std_base.t
index d9ad6f82a..27084ee4e 100644
--- a/language/src/regent/std_base.t
+++ b/language/src/regent/std_base.t
@@ -451,15 +451,15 @@ do
     end
   end
   -- Prefill the table of reduction op IDs for complex types.
-  do
-    base.update_reduction_op("+", base.complex32, c.LEGION_REDOP_SUM_COMPLEX64)
-    base.update_reduction_op("-", base.complex32, c.LEGION_REDOP_SUM_COMPLEX64)
-    base.update_reduction_op("*", base.complex32, c.LEGION_REDOP_PROD_COMPLEX64)
-    base.update_reduction_op("/", base.complex32, c.LEGION_REDOP_PROD_COMPLEX64)
-
-    base.update_reduction_op("+", base.complex64, c.LEGION_REDOP_SUM_COMPLEX128)
-    base.update_reduction_op("-", base.complex64, c.LEGION_REDOP_SUM_COMPLEX128)
-  end
+  -- do
+  --   base.update_reduction_op("+", base.complex32, c.LEGION_REDOP_SUM_COMPLEX64)
+  --   base.update_reduction_op("-", base.complex32, c.LEGION_REDOP_SUM_COMPLEX64)
+  --   base.update_reduction_op("*", base.complex32, c.LEGION_REDOP_PROD_COMPLEX64)
+  --   base.update_reduction_op("/", base.complex32, c.LEGION_REDOP_PROD_COMPLEX64)
+
+  --   base.update_reduction_op("+", base.complex64, c.LEGION_REDOP_SUM_COMPLEX128)
+  --   base.update_reduction_op("-", base.complex64, c.LEGION_REDOP_SUM_COMPLEX128)
+  -- end
 end
 
 function base.is_reduction_op(privilege)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 145ec601b..329a6a797 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -866,9 +866,9 @@ if(Legion_BUILD_ALL OR Legion_BUILD_BINDINGS)
 else()
   option(Legion_REDOP_COMPLEX "Use reduction operators for complex types" OFF)
 endif()
-if((Legion_BUILD_ALL OR Legion_BUILD_BINDINGS) AND NOT Legion_REDOP_COMPLEX)
-  message(FATAL_ERROR "Bindings require Legion_REDOP_COMPLEX to be set")
-endif()
+# if((Legion_BUILD_ALL OR Legion_BUILD_BINDINGS) AND NOT Legion_REDOP_COMPLEX)
+#   message(FATAL_ERROR "Bindings require Legion_REDOP_COMPLEX to be set")
+# endif()
 if(Legion_REDOP_COMPLEX)
   # define variable for legion_defines.h
   set(LEGION_REDOP_COMPLEX ON)

When building with CMake, also pass: -DLegion_REDOP_COMPLEX=OFF

Submission script

Be sure to set the following in your job submission script:

module load cray-python
module swap $LMOD_FAMILY_PRGENV PrgEnv-amd
module load amd/5.1.0
module load cray-pmi

export FI_MR_CACHE_MONITOR=memhooks
export FI_CXI_RX_MATCH_MODE=software
export GASNET_OFI_DEVICE_0=cxi2
export GASNET_OFI_DEVICE_1=cxi1
export GASNET_OFI_DEVICE_2=cxi3
export GASNET_OFI_DEVICE_3=cxi0
export GASNET_OFI_DEVICE_TYPE=Node
export GASNET_OFI_RECEIVE_BUFF_SIZE=single

Add -gex:amlimit 32 to the runtime arguments for your Legion application. This controls the number of outstanding active messages before flow control kicks in. The limit can be increased further, but larger values have not been tested yet.

If you do not want to run with flow control, then set the following variables:

export FI_MR_CACHE_MONITOR=memhooks
export FI_CXI_RX_MATCH_MODE=software
export FI_CXI_DEFAULT_CQ_SIZE=13107200
export FI_CXI_REQ_BUF_MIN_POSTED=10
export FI_CXI_REQ_BUF_SIZE=25165824

This will significantly increase the amount of memory used by libfabric.

In addition, when running on a single node, pass the following to srun: --network=single_node_vni (see https://github.com/StanfordLegion/gasnet/issues/22)

Slingshot 2.1

Slingshot 2.1 fixes many of the issues above, and Realm flow control is no longer needed. If you see hangs or freezes at startup, try setting either of the following variables:

export GASNET_OFI_RECEIVE_BUFF_SIZE=2M   # default is 1MB

or

export GASNET_OFI_NUM_RECEIVE_BUFFS=16   # default is 8