from __future__ import absolute_import, print_function, division

import os
import sys

import theano
import theano.tensor as T
import theano.tensor.nnet.ctc
from theano import config, gof
from theano.gof import local_optimizer
from theano.gradient import grad_undefined
from theano.tensor.nnet.ctc import ctc_available
from theano.tensor.opt import register_canonicalize

from . import pygpu
from .basic_ops import (gpu_contiguous, as_gpuarray_variable,
                        infer_context_name, gpuarray_helper_inc_dir)
from .elemwise import GpuDimShuffle
from .type import GpuArrayType, gpu_context_type


class GpuConnectionistTemporalClassification(gof.COp):
    """
    GPU wrapper for the Baidu CTC loss function.

    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC
        loss function.
    """
    __props__ = ('compute_grad',)

    _cop_num_inputs = 3
    _cop_num_outputs = 2

    func_file = "./c_code/ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_gpu)"

    params_type = gpu_context_type

    def __init__(self, compute_grad=True):
        if not ctc_available():
            raise RuntimeError('Baidu CTC is not available and '
                               'GpuConnectionistTemporalClassification Op '
                               'cannot be constructed.')

        self.compute_grad = compute_grad
        # Return only the cost by default; the gradient is exposed
        # through L_op() below.
        self.default_output = 0

        gof.COp.__init__(self, self.func_file, self.func_name)

    def c_lib_dirs(self):
        lib_dirs = []
        if ctc_available.path is not None:
            lib_dirs += [ctc_available.path]
        return lib_dirs

    def c_compile_args(self):
        if ctc_available.path is not None:
            if sys.platform != 'darwin' and ' ' in ctc_available.path:
                # Quote the run-time library path when it contains spaces
                # (this quoting is not accepted by the Darwin linker).
                return ['-Wl,-rpath,"' + ctc_available.path + '"']
            else:
                return ['-Wl,-rpath,' + ctc_available.path]
        return []

    def c_libraries(self):
        return ["warpctc", "gpuarray"]

    def c_header_dirs(self):
        dirs = [gpuarray_helper_inc_dir(), pygpu.get_include(),
                config.cuda.include_path]
        if config.ctc.root != '':
            dirs.append(os.path.join(config.ctc.root, "include"))
        return dirs

    def c_headers(self):
        return ['ctc.h', 'numpy_compat.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'gpuarray/types.h', 'gpuarray_api.h',
                'gpuarray/array.h', 'gpuarray/util.h', 'gpuarray/extension.h']

    def get_params(self, node):
        return node.inputs[0].type.context

    def make_node(self, activations, labels, input_lengths):
        context_name = infer_context_name(activations)
        t_activations = as_gpuarray_variable(activations,
                                             context_name=context_name)
        # Ensure the activations array is C-contiguous
        t_activations = gpu_contiguous(t_activations)

        # Labels and input lengths are always on the CPU
        t_labels = T.as_tensor_variable(labels)
        t_input_lengths = T.as_tensor_variable(input_lengths)

        if t_activations.type.dtype != 'float32':
            raise TypeError('activations must use the float32 type.')
        if t_activations.ndim != 3:
            raise ValueError('activations must have 3 dimensions.')

        if t_labels.type.dtype != 'int32':
            raise TypeError('labels must use the int32 type.')
        if t_labels.ndim != 2:
            raise ValueError('labels must have 2 dimensions.')

        if t_input_lengths.type.dtype != 'int32':
            raise TypeError('input_lengths must use the int32 type.')
        if t_input_lengths.ndim != 1:
            raise ValueError('input_lengths must have 1 dimension.')

        # One CTC cost per example in the minibatch
        costs = GpuArrayType(dtype='float32',
                             broadcastable=(False,),
                             context_name=context_name)()
        outputs = [costs]

        if self.compute_grad:
            # Gradients w.r.t. activations, with the same (t, m, p) shape
            gradients = GpuArrayType(dtype='float32',
                                     broadcastable=(False, False, False,),
                                     context_name=context_name)()
            outputs += [gradients]

        return theano.Apply(self, inputs=[t_activations, t_labels,
                                          t_input_lengths],
                            outputs=outputs)

    def L_op(self, inputs, outputs, output_grads):
        # The gradients were already computed by the Op's C code, as its
        # second output.
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None

        # Gradient of the outer cost, used to compose the chain rule;
        # it holds one scalar per minibatch example.
        grad_op = output_grads[0]
        # Shuffle the precomputed gradients from (t, m, p) to (m, t, p)
        # so the minibatch dimension leads ...
        grad_shuffle = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                     new_order=(1, 0, 2))(gradients)
        # ... scale each example's gradient by its incoming cost gradient ...
        grad_bdot = T.basic.batched_dot(grad_op, grad_shuffle)
        # ... and shuffle back to (t, m, p) to match the activations input.
        grad_shuffle_reverse = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                             new_order=(1, 0, 2))(grad_bdot)

        return [grad_shuffle_reverse,
                grad_undefined(self, 1, inputs[1]),
                grad_undefined(self, 2, inputs[2])]
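
# A minimal sketch of applying the Op directly (names are illustrative;
# gpu_ctc() below is the intended entry point). With compute_grad=True the
# Apply node carries two outputs, and default_output=0 makes the call
# return the cost:
#
#   op = GpuConnectionistTemporalClassification(compute_grad=True)
#   costs = op(activations, labels, input_lengths)  # shape (m,)
#   gradients = costs.owner.outputs[1]              # shape (t, m, p)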


def gpu_ctc(activations, labels, input_lengths):
    """
    Compute CTC loss function on the GPU.

    Parameters
    ----------
    activations
        Three-dimensional tensor with shape (t, m, p), where t is the time
        index, m is the minibatch index, and p is the index over the
        probabilities of each symbol in the alphabet. The memory layout is
        assumed to be C-order, in which dimensions run from the slowest to
        the fastest changing, from left to right; here, p is the fastest
        changing dimension.
    labels
        A 2-D tensor of all the labels for the minibatch. Each row holds
        one sequence of target labels. Negative values are assumed to be
        padding, and thus are ignored. The blank symbol is assumed to have
        index 0 in the alphabet.
    input_lengths
        A 1-D tensor with the number of time steps for each sequence in
        the minibatch.

    Returns
    -------
    1-D array
        Cost of each example in the minibatch.
    """
    return GpuConnectionistTemporalClassification()(activations, labels,
                                                    input_lengths)
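
# Example usage (a sketch, assuming a working CUDA context and an installed
# warp-ctc; variable names are illustrative):
#
#   import theano
#   import theano.tensor as T
#   from theano.gpuarray.ctc import gpu_ctc
#
#   activations = T.ftensor3('activations')     # float32, shape (t, m, p)
#   labels = T.imatrix('labels')                # int32, negative = padding
#   input_lengths = T.ivector('input_lengths')  # int32, one per example
#
#   costs = gpu_ctc(activations, labels, input_lengths)
#   grad = theano.grad(costs.sum(), wrt=activations)
#   f = theano.function([activations, labels, input_lengths], [costs, grad])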


# Disable gradient computation if it is not needed
@register_canonicalize("fast_compile")
@local_optimizer([GpuConnectionistTemporalClassification])
def local_gpu_ctc_no_grad(node):
    if isinstance(node.op, GpuConnectionistTemporalClassification):
        if len(node.outputs) > 1:
            if len(node.outputs[1].clients) == 0:
                # The gradient output is unused: rebuild the node with
                # compute_grad=False and drop the second output.
                new_op = GpuConnectionistTemporalClassification(
                    compute_grad=False)
                return [new_op(*node.inputs), None]
    return False
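
# Illustrative effect of the rewrite above: a node built as
#     GpuConnectionistTemporalClassification{compute_grad=True}(...)
# whose gradient output has no clients is replaced during canonicalization
# (the rewrite is also tagged 'fast_compile') by
#     GpuConnectionistTemporalClassification{compute_grad=False}(...)
# so the underlying warp-ctc call can skip the gradient computation.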