
Using the CRF layer with multi-threaded GPU execution fails when batch_size is not 1 #9261

Closed

jshower opened this issue Mar 20, 2018 · 6 comments
Assignees
guru4elephant
Labels
User (used to mark user questions)

Comments

jshower (Contributor) commented Mar 20, 2018

The problem: when running code that contains a CRF layer with ParallelDo (multi-threading) on a single GPU, if batch_size is not 1 (e.g. 10), the error pasted at the end of this report occurs. Note that this is the same error as in #9234.
The code used is:

import sys
import math

import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.fluid as fluid
import contextlib
import time
import unittest
from five_corss_val import filereader

sys.stdout.flush()

word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_len = len(verb_dict)

mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 10

IS_SPARSE = True
PASS_NUM = 1
BATCH_SIZE = 1

embedding_name = 'emb'
default_std = 1 / math.sqrt(hidden_dim) / 3.0

def load_parameter(file_name, h, w):
    with open(file_name, 'rb') as f:
        f.read(16)  # skip header.
        return np.fromfile(f, dtype=np.float32).reshape(h, w)

def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
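# Note: this helper is not actually called below (the DataFeeder handles the
# LoD conversion); for reference, to_lodtensor([[1, 2, 3], [4, 5]], place)
# would return a [5, 1] int64 LoDTensor with lod [[0, 3, 5]].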

word = fluid.layers.data(
    name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
    name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
    name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
    name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
    name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
    name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
    name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
    name='mark_data', shape=[1], dtype='int64', lod_level=1)

target = fluid.layers.data(
    name='target', shape=[1], dtype='int64', lod_level=1)

places = fluid.layers.get_places(device_count=0)
pd = fluid.layers.ParallelDo(places)    
with pd.do():
    word_ = pd.read_input(word)
    predicate_ = pd.read_input(predicate)
    ctx_n2_ = pd.read_input(ctx_n2)
    ctx_n1_ = pd.read_input(ctx_n1)
    ctx_0_ = pd.read_input(ctx_0)
    ctx_p1_ = pd.read_input(ctx_p1)
    ctx_p2_ = pd.read_input(ctx_p2)
    mark_ = pd.read_input(mark)
    target_ = pd.read_input(target)

    predicate_embedding = fluid.layers.embedding(
        input=predicate_,
        size=[pred_len, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='vemb', learning_rate=5))

    mark_embedding = fluid.layers.embedding(
        input=mark_,
        size=[mark_dict_len, mark_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='mark_emb', learning_rate=5))

    word_input = [word_, ctx_n2_, ctx_n1_, ctx_0_, ctx_p1_, ctx_p2_]
    emb_layers = [
        fluid.layers.embedding(
            size=[word_dict_len, word_dim],
            input=x,
            param_attr=fluid.ParamAttr(
                name=embedding_name, trainable=False)) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)

    hidden_0_layers = [
        fluid.layers.fc(input=emb, size=hidden_dim, act="tanh") for emb in emb_layers
    ]

    hidden_0 = fluid.layers.sums(input=hidden_0_layers)

    lstm_0 = fluid.layers.dynamic_lstm(
        input=hidden_0,
        size=hidden_dim,
        candidate_activation='relu',
        gate_activation='sigmoid',
        cell_activation='sigmoid')

    # stack L-LSTM and R-LSTM with direct edges
    input_tmp = [hidden_0, lstm_0]

    for i in range(1, depth):
        mix_hidden = fluid.layers.sums(input=[
            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act="tanh"),
            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act="tanh")
        ])

        lstm = fluid.layers.dynamic_lstm(
            input=mix_hidden,
            size=hidden_dim,
            candidate_activation='relu',
            gate_activation='sigmoid',
            cell_activation='sigmoid',
            is_reverse=((i % 2) == 1))

        input_tmp = [mix_hidden, lstm]

    feature_out = fluid.layers.sums(input=[
        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act="tanh"),
        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act="tanh")
    ])

    crf_cost = fluid.layers.linear_chain_crf(
        input=feature_out,
        label=target,
        param_attr=fluid.ParamAttr(
            name='crfw', learning_rate=mix_hidden_lr))
    avg_cost1 = fluid.layers.mean(x=crf_cost)

    pd.write_output(avg_cost1)
    pd.write_output(feature_out)

avg_cost_on_each_devs, feature_out_on_each_devs = pd()
feature_out_on_each_devs.stop_gradient = True
avg_cost = fluid.layers.mean(x=avg_cost_on_each_devs)

sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)

crf_decode = fluid.layers.crf_decoding(
    input=feature_out_on_each_devs, param_attr=fluid.ParamAttr(name='crfw'))

(precision, recall, f1_score, num_infer_chunks, num_label_chunks,
 num_correct_chunks) = fluid.layers.chunk_eval(
    input=crf_decode,
    label=target,
    chunk_scheme="IOB",
    num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))

inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
    inference_program = fluid.io.get_inference_program([])

train_data = paddle.batch(
    paddle.reader.shuffle(
        filereader.file_reader("five_corss_val/" + str(sys.argv[1])), buf_size=8192),
    batch_size=BATCH_SIZE)

test_data = paddle.batch(
    paddle.reader.shuffle(
        filereader.file_reader("five_corss_val/" + str(sys.argv[2])), buf_size=8192),
    batch_size=BATCH_SIZE)

place = fluid.CUDAPlace(0)

feeder = fluid.DataFeeder(
    feed_list=[
        word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
    ],
    place=place)
exe = fluid.Executor(place)

exe.run(fluid.default_startup_program())

embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
embedding_param.set(
    load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)
start_time = time.time()
#batch_id = 0
for pass_id in xrange(PASS_NUM):
    #chunk_evaluator.reset(exe)
    time_start = time.time()
    sum_infer = 0
    sum_label = 0
    sum_correct = 0
    for data in train_data():
        start_time = time.time()
        cost, num_infer, num_label, num_correct = exe.run(
            fluid.default_main_program(),
            feed=feeder.feed(data),
            fetch_list=[avg_cost, num_infer_chunks, num_label_chunks, num_correct_chunks])
        sum_infer += num_infer
        sum_label += num_label
        sum_correct += num_correct
        print("cost:" + str(cost[0]), ", num_infer:" + str(num_infer[0]) + ", num_label:" + str(num_label[0]) + ", num_correct:" + str(num_correct))
    precision = 0
    recall = 0
    f1_score = 0
    if sum_infer != 0:
        precision = sum_correct * 1.0 / sum_infer
    if sum_label != 0:
        recall = sum_correct * 1.0 / sum_label
    if precision != 0 or recall != 0:
        f1_score = precision * recall * 2.0 / (precision + recall)
    print("pass_id:" + str(pass_id) + ", precision:" + str(precision) + ", recall:" + str(recall) + ", f1_score:" + str(f1_score))
    time_end = time.time()
    print("pass_id:" + str(pass_id) + ", cost_time:" + str(time_end - time_start))   
    save_dirname = sys.argv[2] + ".save_model_multi_thread." + str(pass_id)
    fluid.io.save_inference_model(
        save_dirname,
        ['word_data', 'verb_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data',
         'ctx_p1_data', 'ctx_p2_data', 'mark_data', 'target'],
        [num_infer_chunks, num_label_chunks, num_correct_chunks], exe)

This is an attempt at a multi-threaded rewrite of Paddle/python/paddle/fluid/tests/book/test_label_semantic_roles.py. The problem can be reproduced by turning test_label_semantic_roles.py into a multi-threaded version; you can also contact me and I will provide the environment.
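If the custom five_corss_val.filereader module is not available, my guess is that the public conll05 reader (the one test_label_semantic_roles.py itself feeds) can be swapped in for the two readers above to reproduce the problem without my data files, roughly like this (not verified, and it assumes conll05.test() yields records in the same order as the feed_list):

# Hypothetical drop-in replacement for the five_corss_val readers, for
# reproduction only; conll05.test() is assumed to match the feed order
# [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target].
train_data = paddle.batch(
    paddle.reader.shuffle(conll05.test(), buf_size=8192),
    batch_size=BATCH_SIZE)
test_data = paddle.batch(conll05.test(), batch_size=BATCH_SIZE)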

Traceback (most recent call last):
  File "new_srl_base_tanh_MultiThread.py", line 289, in <module>
    fetch_list=[avg_cost, num_infer_chunks, num_label_chunks, num_correct_chunks])
  File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/executor.py", line 349, in run
    self.executor.run(program_cache.desc, scope, 0, True, True)
paddle.fluid.core.EnforceNotMet: var crfw@GRAD is both input and output, does not support transform at [/paddle_gpu/Paddle/paddle/fluid/framework/operator.cc:535]
PaddlePaddle Call Stacks:
0       0x7f71c45bd48cp paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 572
1       0x7f71c51b818fp paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 2783
2       0x7f71c4f96af2p paddle::operators::ParallelDoGradOp::AccumulateGrad(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&, std::vector<paddle::framework::Scope*, std::allocator<paddle::framework::Scope*> > const&, std::vector<boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_>, std::allocator<boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> > > const&) const + 2482
3       0x7f71c4f9b3bcp paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 2220
4       0x7f71c466e4a5p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool) + 1781
5       0x7f71c466fa5fp paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 63
6       0x7f71c45d9fc3p void pybind11::cpp_function::initialize<pybind11::cpp_function::initialize<void, paddle::framework::Executor, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::Executor::*)(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool)#1}, void, paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(pybind11::cpp_function::initialize<void, paddle::framework::Executor, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::Executor::*)(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool)#1}&&, void (*)(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(pybind11::detail::function_call&)#3}::_FUN(pybind11::detail::function_call) + 579
7       0x7f71c45d7d04p pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 1236
8             0x4c37edp PyEval_EvalFrameEx + 31165
9             0x4b9ab6p PyEval_EvalCodeEx + 774
10            0x4c16e7p PyEval_EvalFrameEx + 22711
11            0x4b9ab6p PyEval_EvalCodeEx + 774
12            0x4eb30fp
13            0x4e5422p PyRun_FileExFlags + 130
14            0x4e3cd6p PyRun_SimpleFileExFlags + 390
15            0x493ae2p Py_Main + 1554
16      0x7f721003b830p __libc_start_main + 240
17            0x4933e9p _start + 41

luotao1 added the User (used to mark user questions) label Mar 21, 2018
guru4elephant self-assigned this Mar 21, 2018
wangpeiing commented Mar 22, 2018

@jacquesqiao is looking into your issue.

jacquesqiao (Member) commented

processing

jshower (Contributor, Author) commented Mar 27, 2018

Is there a conclusion yet? If there is, please let me know.

jacquesqiao (Member) commented

The problem is confirmed. The cause is that the CRF layer has no GPU kernel, so an automatic memory (data) transform is performed; this mechanism breaks once regularization is added. We are working out a fix.
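Until there is a proper fix, one thing you could try (only a sketch, not verified against your script) is to keep the CPU-only CRF cost outside of ParallelDo: write only the per-device emission scores out of the parallel block and apply linear_chain_crf once on the merged output, so crfw@GRAD is never accumulated inside ParallelDoGradOp. A minimal, self-contained illustration of the pattern:

import paddle.fluid as fluid

word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int64', lod_level=1)

places = fluid.layers.get_places(device_count=0)
pd = fluid.layers.ParallelDo(places)
with pd.do():
    word_ = pd.read_input(word)
    emb = fluid.layers.embedding(input=word_, size=[1000, 32], dtype='float32')
    emission = fluid.layers.fc(input=emb, size=10, act='tanh')
    pd.write_output(emission)         # only the emission scores leave the block

emission = pd()                       # merged across devices
crf_cost = fluid.layers.linear_chain_crf(
    input=emission,
    label=label,                      # the original, un-split label variable
    param_attr=fluid.ParamAttr(name='crfw'))
avg_cost = fluid.layers.mean(x=crf_cost)
fluid.optimizer.SGD(learning_rate=0.001).minimize(avg_cost)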

jshower (Contributor, Author) commented Apr 8, 2018

Hello, a question about the issue above: if the Place is a single GPU card, does ParallelDo still take effect? Is there such a thing as multi-threading on a single GPU, or is running multi-threading on one GPU card, as I did above, itself wrong? And how should multi-threading across multiple cards be done?
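For example, is something along these lines (only my guess at the API usage, not verified) the intended way to split a batch across several cards, together with setting CUDA_VISIBLE_DEVICES before launching?

# Hypothetical: request a fixed number of CUDA places instead of
# device_count=0 (which, as I understand it, picks up every visible device).
places = fluid.layers.get_places(device_count=2)
pd = fluid.layers.ParallelDo(places)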

jshower (Contributor, Author) commented Apr 9, 2018

Resolved.

jshower closed this as completed Apr 9, 2018