Skip to content

Commit

Permalink
fix multi machine test (#5984)
Browse files: browse the repository at this point in the history
Signed-off-by: daquexian <daquexian566@gmail.com>

Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
  • Loading branch information
daquexian and oneflow-ci-bot committed Aug 21, 2021
1 parent fc60e22 commit 611933a
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 20 deletions.
2 changes: 1 addition & 1 deletion python/oneflow/framework/unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def has_node_list():
def node_size():
node_num_from_env = os.getenv("ONEFLOW_TEST_NODE_NUM", None)
if node_num_from_env:
- return node_num_from_env
+ return int(node_num_from_env)
elif has_node_list():
node_list_from_env = node_list()
return len(node_list_from_env)
Expand Down
24 changes: 5 additions & 19 deletions python/oneflow/test/modules/test_allreduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,30 +34,16 @@ def test_all_reduce(test_case):
x = flow.Tensor(arr_rank2)
else:
raise ValueError
- x = x.to(f"cuda:{flow.distributed.get_local_rank()}")
- nccl_allreduce_op = (
- flow.builtin_op("eager_nccl_all_reduce")
- .Input("in")
- .Output("out")
- .Attr("parallel_conf", f'device_tag: "gpu", device_name: "0:0-1"')
- .Build()
- )
- y = nccl_allreduce_op(x)[0]
+ x = x.to("cuda")
+ y = flow.F.all_reduce(x)
test_case.assertTrue(np.allclose(y.numpy(), arr_rank1 + arr_rank2))

@flow.unittest.skip_unless_2n2d()
def test_all_reduce_2nodes(test_case):
np_arr = np.array([1, 2])
- x = flow.Tensor(np_arr * flow.distributed.get_rank())
- x = x.to(f"cuda:{flow.distributed.get_local_rank()}")
- nccl_allreduce_op = (
- flow.builtin_op("eager_nccl_all_reduce")
- .Input("in")
- .Output("out")
- .Attr("parallel_conf", f'device_tag: "gpu", device_name: "0-1:0-1"')
- .Build()
- )
- y = nccl_allreduce_op(x)[0]
+ x = flow.Tensor(np_arr * (flow.distributed.get_rank() + 1))
+ x = x.to("cuda")
+ y = flow.F.all_reduce(x)
test_case.assertTrue(np.allclose(y.numpy(), np_arr * 10))


Expand Down

0 comments on commit 611933a

Please sign in to comment.