Skip to content

Commit

Permalink
Merge pull request #3712 from tensor-tang/merge
Browse files Browse the repository at this point in the history
add MKLDNN_DEVICE
  • Loading branch information
luotao1 committed Aug 30, 2017
2 parents b45d020 + c5183ca commit 322d9ad
Show file tree
Hide file tree
Showing 12 changed files with 609 additions and 134 deletions.
2 changes: 1 addition & 1 deletion cmake/external/mkldnn.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "v0.9"
GIT_TAG "v0.10"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
Expand Down
2 changes: 1 addition & 1 deletion cmake/external/mklml.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ INCLUDE(ExternalProject)

SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.20170720")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
Expand Down
2 changes: 1 addition & 1 deletion paddle/gserver/layers/Layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace paddle {
Layer::Layer(const LayerConfig& config, bool useGpu)
: config_(config),
useGpu_(useGpu),
deviceId_(-1),
deviceId_(CPU_DEVICE),
needSequenceInfo_(true) {}

bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
Expand Down
29 changes: 28 additions & 1 deletion paddle/gserver/layers/Layer.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,12 @@ class Layer {
LayerConfig config_;
/// whether to use GPU
bool useGpu_;
/// Device Id. CPU is -1, and GPU is 0, 1, 2 ...
/// Paddle device IDs for the non-GPU devices: MKLDNN is -2, CPU is -1.
/// These are the negative values that deviceId_ can take.
enum PADDLE_DEVICE_ID {
/// Identifier of the MKLDNN device.
MKLDNN_DEVICE = -2,
/// Identifier of the CPU device.
CPU_DEVICE = -1,
};
/// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
int deviceId_;
/// Input layers
std::vector<LayerPtr> inputLayers_;
Expand All @@ -77,6 +82,7 @@ class Layer {
Argument output_;
/// Several outputs stored on different devices, used in 'parallel_nn' case,
/// and record them by deviceId_.
/// Also used in 'use_mkldnn' case.
std::vector<Argument> outputOtherDevice_;
/// If there are several outputs, map them by each name.
std::map<std::string, Argument*> outputMap_;
Expand Down Expand Up @@ -172,6 +178,13 @@ class Layer {
return inputLayer.getOutput(deviceId_);
}

/**
 * Fetch the output Argument that the input layer at `inputIndex` holds for
 * the device identified by `deviceId`.
 */
const Argument& getInput(size_t inputIndex, int deviceId) const {
  const auto& prevLayer = inputLayers_[inputIndex];
  return prevLayer->getOutput(deviceId);
}

/**
* Get the forward-input value.
*/
Expand All @@ -186,6 +199,13 @@ class Layer {
return inputLayer.getOutput(deviceId_).value;
}

/**
 * Forward-input value of the input layer at `inputIndex`, read from that
 * layer's output for the device identified by `deviceId`.
 */
const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
  const auto& prevLayer = inputLayers_[inputIndex];
  return prevLayer->getOutput(deviceId).value;
}

/**
* Get the forward-input grad.
*/
Expand All @@ -200,6 +220,13 @@ class Layer {
return inputLayer.getOutput(deviceId_).grad;
}

/**
 * Forward-input grad of the input layer at `inputIndex`, read from that
 * layer's output for the device identified by `deviceId`.
 */
const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
  const auto& prevLayer = inputLayers_[inputIndex];
  return prevLayer->getOutput(deviceId).grad;
}

/**
* Get the forward-input label.
*/
Expand Down
220 changes: 118 additions & 102 deletions paddle/gserver/layers/MKLDNNFcLayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
return;
}

// TODO(TJ): dst format should get from wgtVal_
int dstFmt = PARAM_FORMAT_MKLDNN_OI;
int srcFmt = weight_->getParameterPtr()->getHeaderFormat();
if (srcFmt == dstFmt) {
return;
}

// The weight_ is transposed from initial paddle weight
MatrixPtr paddleWgt = Matrix::create(
weight_->getW()->getData(), iLayerSize_, oc_, false, false);

// TODO(TJ): remove this print when do not need differ weights
std::ostringstream ostr;
paddleWgt->print(ostr);
VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();

// The mkldnn weight is transposed from initial paddle matrix
MatrixPtr paddleWgtT;
paddleWgt->transpose(paddleWgtT, true);
weight_->getW()->copyFrom(*paddleWgtT);
weight_->getParameterPtr()->setHeaderFormat(dstFmt);
CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
hasInitedWgt_ = true;
}

void MKLDNNFcLayer::convertWeightsToPaddle() {
MatrixPtr dnnWgt = weight_->getW();
MatrixPtr paddleWgt;
dnnWgt->transpose(paddleWgt, true);

// copy paddle weight and override on weight_
MatrixPtr dnnWgtT = Matrix::create(
dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
dnnWgtT->copyFrom(*paddleWgt);
CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
}

/**
 * Expose this layer's output to the other devices recorded in
 * outputOtherDevice_.
 *
 * Calls copyOutputInfoToOtherDevice() first. Then every entry whose deviceId
 * is CPU_DEVICE gets its value pointed at output_.value. A warning is logged
 * if more than one CPU entry is found.
 */
void MKLDNNFcLayer::convertOutputToOtherDevice() {
  copyOutputInfoToOtherDevice();

  // The fc output value needs no conversion for the CPU device, so each CPU
  // entry just shares the pointer to the output value.
  int cpuOutputs = 0;
  for (auto& other : outputOtherDevice_) {
    if (other.deviceId != CPU_DEVICE) {
      continue;
    }
    other.value = output_.value;
    ++cpuOutputs;
  }

  if (cpuOutputs > 1) {
    LOG(WARNING) << "should not have more than one CPU device";
  }
}

void MKLDNNFcLayer::reshape() {
const Argument& input = getInput(0);
const Argument& input = getInput(0, getPrev(0)->getDeviceId());
int batchSize = input.getBatchSize();
if (bs_ == batchSize) {
return;
Expand All @@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() {
if (iw_ == 0) {
iw_ = 1;
}
hasSpatial_ = true;
if (ih_ == 1 && iw_ == 1) {
hasSpatial_ = false;
}
CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
ic_ = iLayerSize_ / (ih_ * iw_);
CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
Expand All @@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() {

void MKLDNNFcLayer::resetFwd() {
bool hasBias = biases_ && biases_->getW();
real* iData = getInputValue(0)->getData();
real* oData = getOutputValue()->getData();
real* wData = weight_->getW()->getData();
real* bData = hasBias ? biases_->getW()->getData() : NULL;

// TODO(TJ): below create should be covered in MkldnnMatrix
// create memory desc
memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
: createMD({bs_, ic_}, format::nc);
memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
: createMD({oc_, ic_}, format::oi);
memory::desc bMD = bData != NULL ? createMD({oc_}, format::x)
: createMD({}, format::format_undef);
memory::desc oMD = createMD({bs_, oc_}, format::nc);

// create memory primitive desc and memory self
inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData));
outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData));
const MatrixPtr& wgt = weight_->getW();
const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
const MatrixPtr& out = output_.value;

if (inputIsOnlyMKLDNN()) {
const MatrixPtr& in = getInputValue(0);
inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
CHECK(inVal_) << "Input should be MKLDNNMatrix";
} else {
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
inVal_ = MKLDNNMatrix::create(
in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
}
inVal_->downSpatial();
wgtVal_ = MKLDNNMatrix::create(
wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
wgtVal_->downSpatial();
biasVal_ =
hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);

// change original output value to mkldnn output value
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
if (!outputIsOnlyMKLDNN()) {
convertOutputToOtherDevice();
}

// create forward handle
prop_kind pk = prop_kind::forward;
fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD)
: fc_fwd::desc(pk, iMD, wMD, oMD);
fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
inVal_->getMemoryDesc(),
wgtVal_->getMemoryDesc(),
biasVal_->getMemoryDesc(),
outVal_->getMemoryDesc())
: fc_fwd::desc(pk,
inVal_->getMemoryDesc(),
wgtVal_->getMemoryDesc(),
outVal_->getMemoryDesc());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);

if (bData != NULL) {
biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
if (hasBias) {
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
} else {
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
}
printValueFormatFlow();

pipelineFwd_.clear();
pipelineFwd_.push_back(*fwd_);
}
Expand All @@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() {
return;
}
needResetBwd_ = false;

bool hasBias = biases_ && biases_->getWGrad();
real* iData = getInputValue(0)->getData();
real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
real* oDiff = getOutputGrad()->getData();
real* wDiff = weight_->getWGrad()->getData();
real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;

/// backward weight
// create memory desc for backward memory
memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
: createMD({bs_, ic_}, format::nc);
memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
: createMD({oc_, ic_}, format::oi);
memory::desc oMD = createMD({bs_, oc_}, format::nc);
memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x)
: createMD({}, format::format_undef);

if (inVal_) {
// update data
inVal_->set_data_handle(iData);
} else {
inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
}

// create memory primitive desc and memory self
wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff));
outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));

fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD);
CHECK(inVal_) << "Should have input value";
const MatrixPtr& wgt = weight_->getWGrad();
const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;

// TODO(TJ): merge outgrad
int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
// For the MKLDNN device:
// the output grad cannot be cast directly to an MKLDNNMatrix,
// since each layer cannot write its input grad into an MKLDNN input grad.
// So just create it from the matrix, using the output value's format.
// For the CPU device:
// fc does not need to convert from the CPU device, since its output is
// always in nc format; it only needs to be created from the CPU device.
const MatrixPtr& out = getOutput(device).grad;
outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
: nullptr;

// create memory primitive desc
fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL
? fc_bwdWgt::desc(iMD, wMD, bMD, oMD)
: fc_bwdWgt::desc(iMD, wMD, oMD);
fc_bwdWgt::desc bwdWgtDesc = hasBias
? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
biasGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc())
: fc_bwdWgt::desc(inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_bwdWgt::primitive_desc bwdWgtPD =
fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);

if (bDiff != NULL) {
biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
if (hasBias) {
bwdWgt_.reset(
new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
} else {
Expand All @@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() {
pipelineBwd_.push_back(*bwdWgt_);

/// backward data
if (iDiff == NULL) {
device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
const MatrixPtr& in = getInputGrad(0, device);
if (in == nullptr) {
return;
}
fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD);
if (getInput(0, device).getAllCount() > 1) {
// TODO(TJ): use outputMaps_ ways when merge outgrad done
} else {
inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
}

fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_bwdData::primitive_desc bwdDataPD =
fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));

CHECK(wgtVal_) << "Should have weight memory";
bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
printGradFormatFlow();
pipelineBwd_.push_back(*bwdData_);
}

Expand All @@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) {

{
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());

// update input data
// since it might be changed if this is after data layer
real* iData = getInputValue(0)->getData();
inVal_->set_data_handle(iData);
syncInputValue();

// just submit forward pipeline
stream_->submit(pipelineFwd_);
Expand All @@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
resetBwd();

// update diff
real* oDiff = getOutputGrad()->getData();
outGrad_->set_data_handle(oDiff);

syncOutputGrad();
// just submit backward pipeline
stream_->submit(pipelineBwd_);
}
Expand Down
Loading

0 comments on commit 322d9ad

Please sign in to comment.