Merge branch 'master' into dev_warp_softmax
guo-ran committed Jun 1, 2021
2 parents 5f34b13 + 5133512 commit 2b08c61
Showing 22 changed files with 437 additions and 125 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/test.yml
@@ -489,6 +489,11 @@ jobs:
docker run $extra_docker_args \
oneflow-test:$USER \
bash -c "bash ci/test/try_install.sh && bash ci/test/build_docs.sh"
- name: Query system status
if: ${{ failure() }}
run: |
nvidia-smi
docker ps
- name: Remove container
if: always()
run: |
4 changes: 2 additions & 2 deletions cmake/third_party/glog.cmake
@@ -18,9 +18,9 @@ else()
if(BUILD_SHARED_LIBS)
# Must use a shared lib with cpack version
set(GLOG_VER 0.3.4)
if(${CMAKE_SHARED_LIBRARY_SUFFIX} STREQUAL ".dylib")
if("${CMAKE_SHARED_LIBRARY_SUFFIX}" STREQUAL ".dylib")
set(GLOG_LIBRARY_NAMES libglog.${GLOG_VER}.dylib)
elseif(${CMAKE_SHARED_LIBRARY_SUFFIX} STREQUAL ".so")
elseif("${CMAKE_SHARED_LIBRARY_SUFFIX}" STREQUAL ".so")
set(GLOG_LIBRARY_NAMES libglog.so.${GLOG_VER})
else()
message(FATAL_ERROR "${CMAKE_SHARED_LIBRARY_SUFFIX} not supported for glog")
4 changes: 2 additions & 2 deletions cmake/third_party/protobuf.cmake
@@ -19,9 +19,9 @@ else()
set(PROTOBUF_BUILD_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf)
if(BUILD_SHARED_LIBS)
set(PB_VER 3.9.2.0)
if(${CMAKE_SHARED_LIBRARY_SUFFIX} STREQUAL ".dylib")
if("${CMAKE_SHARED_LIBRARY_SUFFIX}" STREQUAL ".dylib")
set(PROTOBUF_LIBRARY_NAMES libprotobuf.${PB_VER}.dylib)
elseif(${CMAKE_SHARED_LIBRARY_SUFFIX} STREQUAL ".so")
elseif("${CMAKE_SHARED_LIBRARY_SUFFIX}" STREQUAL ".so")
set(PROTOBUF_LIBRARY_NAMES libprotobuf.so.${PB_VER})
else()
message(FATAL_ERROR "${CMAKE_SHARED_LIBRARY_SUFFIX} not supported for protobuf")
16 changes: 12 additions & 4 deletions cmake/third_party/zlib.cmake
@@ -10,12 +10,20 @@ use_mirror(VARIABLE ZLIB_URL URL ${ZLIB_URL})
if(WIN32)
set(ZLIB_BUILD_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib)
set(ZLIB_LIBRARY_NAMES zlibstaticd.lib)
elseif(APPLE AND ("${CMAKE_GENERATOR}" STREQUAL "Xcode"))
set(ZLIB_BUILD_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib)
set(ZLIB_LIBRARY_NAMES libz.a)
else()
set(ZLIB_BUILD_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib)
set(ZLIB_LIBRARY_NAMES libz.a)
if(BUILD_SHARED_LIBS)
set(Z_VER 1.2.8)
if("${CMAKE_SHARED_LIBRARY_SUFFIX}" STREQUAL ".dylib")
set(ZLIB_LIBRARY_NAMES libz.${Z_VER}.dylib)
elseif("${CMAKE_SHARED_LIBRARY_SUFFIX}" STREQUAL ".so")
set(ZLIB_LIBRARY_NAMES libz.so.${Z_VER})
else()
message(FATAL_ERROR "${CMAKE_SHARED_LIBRARY_SUFFIX} not supported for zlib")
endif()
else()
set(ZLIB_LIBRARY_NAMES libz.a)
endif()
endif()

foreach(LIBRARY_NAME ${ZLIB_LIBRARY_NAMES})
27 changes: 18 additions & 9 deletions oneflow/core/autograd/gradient_funcs/normalization.cpp
@@ -26,6 +26,8 @@ namespace oneflow
namespace one {

struct NormalizationGradInterpState : public OpExprInterpState {
int32_t axis;
float epsilon;
bool is_training;
};

@@ -42,16 +44,15 @@ class NormalizationGrad : public OpExprGradFunction<NormalizationGradInterpState
CHECK_NOTNULL_OR_RETURN(fw_op_expr);
const std::string& op_name = fw_op_expr->op_name();
op_trait_ = std::make_shared<user_op::UserOpConfTrait>(op_name, fw_op_expr->proto());
const float epsilon = JUST(op_trait_->GetAttr<float>("epsilon"));
axis_ = JUST(op_trait_->GetAttr<int32_t>("axis"));
// v1 = variance + eps
add_eps_op_ = JUST(op_expr_helper::ScalarAddOp(epsilon, GradientOpName(op_name + "_add_eps")));
add_eps_op_ =
JUST(op_expr_helper::ScalarAddOp(/*epsilon=*/0, GradientOpName(op_name + "_add_eps")));
// v2 = rsqrt(v1)
rsqrt_op_ = JUST(op_expr_helper::RsqrtOp(GradientOpName(op_name + "_rsqrt")));

// Normalization grad.
normalization_grad_op_ = JUST(op_expr_helper::NormalizationGradOp(
axis_, epsilon, GradientOpName(op_name + "_norm_grad")));
/*axis=*/-1, /*epsilon=*/0, GradientOpName(op_name + "_norm_grad")));

reshape_gamma_op_ =
JUST(op_expr_helper::ReshapeOp(Shape{-1}, GradientOpName(op_name + "_reshape_gamma")));
@@ -68,6 +69,8 @@ class NormalizationGrad : public OpExprGradFunction<NormalizationGradInterpState

Maybe<void> Capture(NormalizationGradInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
ctx->axis = JUST(op_trait_->GetAttr<int32_t>("axis", attrs));
ctx->epsilon = JUST(op_trait_->GetAttr<float>("epsilon", attrs));
ctx->is_training = JUST(op_trait_->GetAttr<bool>("training", attrs));
ctx->SaveTensorForBackward(inputs.at(0)); // x
ctx->SaveTensorForBackward(inputs.at(3)); // gamma
@@ -94,12 +97,19 @@ class NormalizationGrad : public OpExprGradFunction<NormalizationGradInterpState
} else {
const auto& moving_mean = ctx->SavedTensors().at(2); // moving_mean
const auto& moving_variance = ctx->SavedTensors().at(3); // moving_variance
const auto& add_eps = JUST(OpInterpUtil::Dispatch<Tensor>(*add_eps_op_, {moving_variance}));
MutableAttrMap epsilon_attr;
JUST(epsilon_attr.SetAttr<float>("epsilon", ctx->epsilon));
const auto& add_eps =
JUST(OpInterpUtil::Dispatch<Tensor>(*add_eps_op_, {moving_variance}, epsilon_attr));
mean = moving_mean;
inv_variance = JUST(OpInterpUtil::Dispatch<Tensor>(*rsqrt_op_, {add_eps}));
}

MutableAttrMap norm_grad_attr;
JUST(norm_grad_attr.SetAttr<int32_t>("axis", ctx->axis));
JUST(norm_grad_attr.SetAttr<float>("epsilon", ctx->epsilon));
const auto& results = JUST(OpInterpUtil::Dispatch<TensorTuple>(
*normalization_grad_op_, {x, y_grad, gamma, mean, inv_variance}));
*normalization_grad_op_, {x, y_grad, gamma, mean, inv_variance}, norm_grad_attr));
CHECK_EQ_OR_RETURN(results->size(), 3);
// The normalization op has 5 inputs which are x, moving_mean, moving_variance, gamma and beta.
in_grads->resize(5);
@@ -112,10 +122,10 @@ class NormalizationGrad : public OpExprGradFunction<NormalizationGradInterpState

DimVector dim_vec;
for (int i = 0; i < x->shape()->NumAxes(); ++i) {
if (i != axis_) {
if (i != ctx->axis) {
dim_vec.push_back(1);
} else {
dim_vec.push_back(x->shape()->At(axis_));
dim_vec.push_back(x->shape()->At(ctx->axis));
}
}
MutableAttrMap shape_attr;
@@ -142,7 +152,6 @@ class NormalizationGrad : public OpExprGradFunction<NormalizationGradInterpState

private:
std::shared_ptr<user_op::UserOpConfTrait> op_trait_;
int32_t axis_;
std::shared_ptr<OpExpr> add_eps_op_;
std::shared_ptr<OpExpr> rsqrt_op_;
std::shared_ptr<OpExpr> normalization_grad_op_;
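This hunk moves axis and epsilon off the grad-function object (the deleted axis_ member) and into the per-call interp state, re-supplying them at apply time through a MutableAttrMap, so one cached grad op expression serves any axis/epsilon combination. The dim_vec loop above builds a broadcast-compatible shape for gamma; a standalone sketch of that computation (illustrative code of mine with a hypothetical name, not part of the commit):

#include <cstdint>
#include <vector>

// Keep the normalized axis, collapse every other axis to 1,
// e.g. x_shape = {N, C, H, W} with axis = 1 gives {1, C, 1, 1}.
std::vector<int64_t> BroadcastShapeForGamma(const std::vector<int64_t>& x_shape,
                                            int32_t axis) {
  std::vector<int64_t> dim_vec;
  for (size_t i = 0; i < x_shape.size(); ++i) {
    dim_vec.push_back(i == static_cast<size_t>(axis) ? x_shape[i] : 1);
  }
  return dim_vec;
}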
91 changes: 91 additions & 0 deletions oneflow/core/autograd/gradient_funcs/where.cpp
@@ -0,0 +1,91 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_expr.h"
#include "oneflow/core/framework/op_expr_helper.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"

namespace oneflow {
namespace one {

struct WhereInterpState : public OpExprInterpState {
bool requires_grad_x;
bool requires_grad_y;
};

class Where : public OpExprGradFunction<WhereInterpState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(WhereInterpState* ctx, const TensorTuple& inputs, const TensorTuple& outputs,
const AttrMap& attrs) const override;
Maybe<void> Apply(const WhereInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;

private:
AttrMap base_attrs_;
std::shared_ptr<OpExpr> zero_like_op_;
std::shared_ptr<OpExpr> where_op_x_;
std::shared_ptr<OpExpr> where_op_y_;
};

Maybe<void> Where::Init(const OpExpr& op) {
const UserOpExpr* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr);
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
const std::string& op_name = fw_op_expr->op_name();
zero_like_op_ = JUST(op_expr_helper::ZeroLikeOp("zeros_like_" + GradientOpName(op_name)));
where_op_x_ = JUST(op_expr_helper::WhereOp("where_x_" + GradientOpName(op_name)));
where_op_y_ = JUST(op_expr_helper::WhereOp("where_y_" + GradientOpName(op_name)));
return Maybe<void>::Ok();
}

Maybe<void> Where::Capture(WhereInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ctx->requires_grad_x = inputs.at(1)->requires_grad();
ctx->requires_grad_y = inputs.at(2)->requires_grad();
if ((!ctx->requires_grad_x) && (!ctx->requires_grad_y)) { return Maybe<void>::Ok(); }

ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->SaveTensorForBackward(inputs.at(0)); // condition
ctx->SaveTensorForBackward(inputs.at(1)); // x
return Maybe<void>::Ok();
}

Maybe<void> Where::Apply(const WhereInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
if ((!ctx->requires_grad_x) && (!ctx->requires_grad_y)) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1);
MutableAttrMap attrs;
const std::shared_ptr<oneflow::one::Tensor>& condition = ctx->SavedTensors().at(0);
const std::shared_ptr<oneflow::one::Tensor>& x = ctx->SavedTensors().at(1);

std::shared_ptr<oneflow::one::Tensor> zero_out =
JUST(OpInterpUtil::Dispatch<Tensor>(*zero_like_op_, {x}));
in_grads->resize(3);
if (ctx->requires_grad_x)
in_grads->at(1) =
JUST(OpInterpUtil::Dispatch<Tensor>(*where_op_x_, {condition, out_grads.at(0), zero_out}));
if (ctx->requires_grad_y)
in_grads->at(2) =
JUST(OpInterpUtil::Dispatch<Tensor>(*where_op_y_, {condition, zero_out, out_grads.at(0)}));
return Maybe<void>::Ok();
}

REGISTER_OP_EXPR_GRAD_FUNCTION("where", Where);

} // namespace one
} // namespace oneflow
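The backward rule this new file implements: the upstream gradient flows to x wherever the condition selected x, to y elsewhere, and the unselected branch receives zeros (grad_x = where(cond, dout, 0), grad_y = where(cond, 0, dout)). A self-contained sketch of the same rule on plain vectors (illustrative code of mine, not OneFlow's kernels):

#include <vector>

// Each output-gradient element is routed to exactly one input,
// chosen by the condition mask; the other input gets 0 there.
void WhereBackward(const std::vector<bool>& cond, const std::vector<float>& dout,
                   std::vector<float>* dx, std::vector<float>* dy) {
  dx->assign(dout.size(), 0.0f);
  dy->assign(dout.size(), 0.0f);
  for (size_t i = 0; i < dout.size(); ++i) {
    if (cond[i]) { (*dx)[i] = dout[i]; } else { (*dy)[i] = dout[i]; }
  }
}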
18 changes: 11 additions & 7 deletions oneflow/core/control/ctrl_client.cpp
@@ -23,13 +23,7 @@ namespace {

} // namespace

GrpcCtrlClient::~GrpcCtrlClient() {
{
std::unique_lock<std::mutex> lck(need_heartbeat_thread_stop_mtx_);
need_heartbeat_thread_stop_ = true;
}
heartbeat_thread_.join();
}
GrpcCtrlClient::~GrpcCtrlClient() { StopHeartbeat(); }

GrpcCtrlClient::GrpcCtrlClient(const ProcessCtx& process_ctx) : process_ctx_(process_ctx) {
rpc_client_.ReserveStubsOfSize(process_ctx.ctrl_addr_size());
@@ -118,4 +112,14 @@ int32_t GrpcCtrlClient::IncreaseCount(const std::string& k, int32_t v) {

void GrpcCtrlClient::EraseCount(const std::string& k) { rpc_client_.EraseCount(k); }

void GrpcCtrlClient::StopHeartbeat() {
bool already_stopped = false;
{
std::unique_lock<std::mutex> lck(need_heartbeat_thread_stop_mtx_);
already_stopped = need_heartbeat_thread_stop_;
need_heartbeat_thread_stop_ = true;
}
if (!already_stopped) { heartbeat_thread_.join(); }
}

} // namespace oneflow
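StopHeartbeat is written to be idempotent: the flag is read and set under the mutex, and join() runs only for the first caller, so both the destructor and GrpcRpcManager::~GrpcRpcManager (further down) can call it safely. A minimal sketch of the same stop-once pattern (illustrative code of mine, not the OneFlow class):

#include <mutex>
#include <thread>

class StoppableWorker {
 public:
  StoppableWorker() : worker_([] { /* worker loop would poll a stop flag */ }) {}
  ~StoppableWorker() { Stop(); }  // harmless if Stop() was already called
  void Stop() {
    bool already_stopped = false;
    {
      std::lock_guard<std::mutex> lock(mu_);
      already_stopped = stop_;
      stop_ = true;  // first caller flips the flag; later callers see it set
    }
    if (!already_stopped) { worker_.join(); }  // join exactly once
  }

 private:
  std::mutex mu_;
  bool stop_ = false;
  std::thread worker_;
};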
5 changes: 3 additions & 2 deletions oneflow/core/eager/opkernel_instruction_type.cpp
@@ -443,9 +443,9 @@ Maybe<T*> GetSharedOpKernel(vm::Instruction* instruction, DeviceType device_type
struct LocalCallOpKernelUtil final {
static inline Maybe<void> Infer(vm::Instruction* instruction) {
auto* operand = JUST(GetLocalCallOpKernelPhyInstrOperand(instruction));
operand->mut_opkernel()->composed_attrs_for_scheduler_thread()->ResetPrior(operand->attrs());
operand->set_user_opkernel(
JUST(operand->mut_opkernel()->ChooseOpKernel(operand->inputs(), operand->outputs())));
operand->mut_opkernel()->ResetDynamicOpAttrs(operand->attrs());
JUST(CheckOutputBlobObjectsMemCase(operand, instruction->stream()));
JUST(InitOutputBlobs(operand));
JUST(InferTempStorageBlobDesc(operand));
@@ -522,7 +522,8 @@ struct LocalCallOpKernelUtil final {
const auto& InferTmpSizeFn = operand->opkernel().GetInferTmpSizeFn(operand->user_opkernel());
auto* temp_blob_desc = operand->mut_opkernel()->mut_temp_blob_object()->mut_blob_desc();
CHECK_OR_RETURN(temp_blob_desc->data_type() == DataType::kChar);
one::LocalUserOpInferContext* op_infer_ctx = operand->opkernel().op_infer_ctx_for_thread_a();
one::LocalUserOpInferContext* op_infer_ctx =
operand->opkernel().op_infer_ctx_for_scheduler_thread();
op_infer_ctx->Update(operand->inputs(), operand->outputs());
size_t temp_size = InferTmpSizeFn(op_infer_ctx);
temp_blob_desc->mut_shape() = Shape({static_cast<int64_t>(temp_size)});
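The renames here pin each ComposedAttrMap and infer context to the thread that owns it (the scheduler thread in this instruction, the main thread in the interpreter hunk further down). ResetPrior suggests a two-level attribute lookup in which per-call attrs are consulted first and fall back to the op's static conf. A sketch of that lookup behaviour under that assumption (illustrative code of mine, not OneFlow's actual ComposedAttrMap):

#include <optional>
#include <string>
#include <unordered_map>

class ComposedAttrs {
 public:
  void ResetPrior(std::unordered_map<std::string, int> prior) { prior_ = std::move(prior); }
  void ResetBase(std::unordered_map<std::string, int> base) { base_ = std::move(base); }
  // Prior (per-call) attrs shadow base (op conf) attrs with the same name.
  std::optional<int> Get(const std::string& name) const {
    if (auto it = prior_.find(name); it != prior_.end()) { return it->second; }
    if (auto it = base_.find(name); it != base_.end()) { return it->second; }
    return std::nullopt;
  }

 private:
  std::unordered_map<std::string, int> prior_;
  std::unordered_map<std::string, int> base_;
};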
10 changes: 10 additions & 0 deletions oneflow/core/framework/op_expr_helper.cpp
@@ -603,6 +603,16 @@ Maybe<one::UserOpExpr> SplitLikeOp(const int n, const int64_t axis, const std::s
.Build();
}

Maybe<one::UserOpExpr> WhereOp() { return WhereOp(UniqueOpName("where")); }
Maybe<one::UserOpExpr> WhereOp(const std::string& name) {
return one::OpBuilder("where", name)
.Input("condition")
.Input("x")
.Input("y")
.Output("out")
.Build();
}

Maybe<one::UserOpExpr> ExpandGradOp(const std::vector<int32_t>& out_shape,
const std::vector<int32_t>& stride) {
return ExpandGradOp(out_shape, stride, UniqueOpName("expand_grad"));
3 changes: 3 additions & 0 deletions oneflow/core/framework/op_expr_helper.h
@@ -195,6 +195,9 @@ Maybe<one::UserOpExpr> TransposeOp(const std::vector<int32_t>& perm, const std::
Maybe<one::UserOpExpr> SplitLikeOp(const int n, const int64_t axis);
Maybe<one::UserOpExpr> SplitLikeOp(const int n, const int64_t axis, const std::string& name);

Maybe<one::UserOpExpr> WhereOp();
Maybe<one::UserOpExpr> WhereOp(const std::string& name);

Maybe<one::UserOpExpr> ExpandGradOp(const std::vector<int32_t>& out_shape,
const std::vector<int32_t>& stride);
Maybe<one::UserOpExpr> ExpandGradOp(const std::vector<int32_t>& out_shape,
@@ -93,11 +93,11 @@ Maybe<void> NaiveInterpret(const UserOpExpr& user_op_expr, const TensorTuple& in
output_eager_blob_objects->at(index)->set_is_shape_synced(false);
}

kernel->ResetDynamicOpAttrs(attrs);
kernel->composed_attrs_for_main_thread()->ResetPrior(attrs);
JUST(kernel->InferDataType(input_eager_blob_objects, output_eager_blob_objects,
kernel->op_infer_ctx_for_thread_b()));
kernel->op_infer_ctx_for_main_thread()));
JUST(kernel->InferTensorDesc(input_eager_blob_objects, output_eager_blob_objects,
kernel->op_infer_ctx_for_thread_b()));
kernel->op_infer_ctx_for_main_thread()));

const auto& instr_type_name = JUST(op_device->local_call_instruction_name());
JUST(PhysicalRun([&](InstructionsBuilder* builder) -> Maybe<void> {
5 changes: 3 additions & 2 deletions oneflow/core/rpc/include/grpc.h
@@ -25,8 +25,8 @@ namespace oneflow
class GrpcCtrlClient final : public CtrlClient {
public:
OF_DISALLOW_COPY_AND_MOVE(GrpcCtrlClient);
GrpcCtrlClient(const ProcessCtx& process_ctx);
~GrpcCtrlClient();
explicit GrpcCtrlClient(const ProcessCtx& process_ctx);
~GrpcCtrlClient() override;

void Barrier(const std::string& barrier_name) override;
void Barrier(const std::string& barrier_name, int32_t barrier_num) override;
@@ -51,6 +51,7 @@ class GrpcCtrlClient final : public CtrlClient {
void Clear() override;
int32_t IncreaseCount(const std::string& k, int32_t v) override;
void EraseCount(const std::string& k) override;
void StopHeartbeat();

private:
const ProcessCtx& process_ctx() const { return process_ctx_; }
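Two small hardening changes in this header: the constructor gains explicit, so a ProcessCtx can no longer convert implicitly into a GrpcCtrlClient, and the destructor is marked override against the virtual base destructor. A minimal illustration of what explicit rules out (my example, not OneFlow code):

struct Ctx {};
struct Client {
  explicit Client(const Ctx&) {}
};
void Use(const Client&);

// Use(Ctx{});          // ill-formed once the constructor is explicit
// Use(Client{Ctx{}});  // fine: the conversion is spelled out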
9 changes: 6 additions & 3 deletions oneflow/core/rpc/lib/grpc.cpp
@@ -15,11 +15,10 @@ limitations under the License.
*/
#ifdef RPC_BACKEND_GRPC

#include "oneflow/core/control/ctrl_client.h"
#include "oneflow/core/rpc/include/grpc.h"
#include "oneflow/core/control/ctrl_bootstrap.h"
#include "oneflow/core/control/ctrl_server.h"
#include "oneflow/core/job/env_desc.h"
#include "oneflow/core/rpc/include/grpc.h"

namespace oneflow {

@@ -57,11 +56,15 @@ Maybe<void> GrpcRpcManager::CreateClient() {
}

GrpcRpcManager::~GrpcRpcManager() {
auto* grpc_client = dynamic_cast<GrpcCtrlClient*>(Global<CtrlClient>::Get());
CHECK_NOTNULL(grpc_client);
grpc_client->StopHeartbeat();
OF_ENV_BARRIER();
Global<CtrlClient>::Delete();
CHECK_NOTNULL(Global<CtrlServer>::Get());
Global<CtrlServer>::Delete();
}

} // namespace oneflow

#endif // RPC_BACKEND_GPRC
#endif // RPC_BACKEND_GRPC
