from __future__ import absolute_import, print_function, division
from functools import wraps
import numpy as np
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.gof.opt import inherit_stack_trace
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape
from .elemwise import GpuDimShuffle, GpuElemwise
def grab_cpu_scalar(v, nd):
"""
Get a scalar variable value from the tree at `v`.
This function will dig through transfers and dimshuffles to get
the constant value. If no such constant is found, it returns None.
Parameters
----------
v
Theano variable to extract the constant value from.
nd : int
Expected number of dimensions for the variable (for
broadcasted constants).
"""
if v.owner is not None:
n = v.owner
if (isinstance(n.op, (GpuDimShuffle, DimShuffle)) and
n.op.new_order == ('x',) * nd):
return grab_cpu_scalar(n.inputs[0], n.inputs[0].ndim)
elif isinstance(n.op, (GpuFromHost, HostFromGpu)):
return grab_cpu_scalar(n.inputs[0], nd)
else:
return None
else:
if (isinstance(v, Constant) and
v.broadcastable == (True,) * nd):
return v.dimshuffle(())
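# Usage sketch (hedged; the constant below is made up for illustration and is
# not part of this module):
#
#     c = tensor.constant(np.float32(2.0))      # 0-d CPU constant
#     v = c.dimshuffle('x', 'x')                 # broadcast it to 2 dimensions
#     grab_cpu_scalar(v, nd=2)                   # -> 0-d view of the constant 2.0
#     grab_cpu_scalar(tensor.fmatrix(), nd=2)    # -> None, no constant to extract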
def find_node(v, cls, ignore_clients=False):
"""
Find the node that has an op of type `cls` in `v`.
This digs through possibly redundant transfers to find the node
that has the type `cls`. If `ignore_clients` is False (the
default) it will only dig through nodes that have a single client
to avoid duplicating computations.
Parameters
----------
v
The variable to dig through
cls : Op class
The type of the node we are looking for
ignore_clients : bool, optional
Whether to ignore multiple clients or not.
"""
if v.owner is not None and (ignore_clients or len(v.clients) == 1):
if isinstance(v.owner.op, cls):
return v.owner
elif (isinstance(v.owner.op, GpuFromHost) and
v.owner.inputs[0].owner is not None and
(ignore_clients or len(v.owner.inputs[0].clients) == 1) and
isinstance(v.owner.inputs[0].owner.op, HostFromGpu)):
return find_node(v.owner.inputs[0].owner.inputs[0], cls)
else:
return None
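# Usage sketch (hedged; ``SomeGpuOp`` is a hypothetical op class). Given a
# graph fragment ``y = GpuFromHost(ctx)(HostFromGpu(SomeGpuOp()(x)))`` built
# inside a FunctionGraph (so variables carry ``clients``),
# ``find_node(y, SomeGpuOp)`` digs through the redundant transfer pair and
# returns the apply node of ``SomeGpuOp``, provided each intermediate variable
# has a single client (or ``ignore_clients=True`` is passed).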
def is_equal(var, val):
"""
Returns True if `var` is always equal to `val`.
This will only return True if the variable will always be equal to
the value. If it might not be true in some cases then it returns False.
Parameters
----------
var
Variable to compare
val
Python value
"""
try:
v = get_scalar_constant_value(var)
return v == val
except NotScalarConstantError:
return False
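# For example (hedged): ``is_equal(tensor.constant(0.0), 0.0)`` is True, while
# ``is_equal(tensor.fscalar(), 0.0)`` is False because the value of a free
# scalar cannot be known at compilation time.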
def alpha_merge(cls, alpha_in, beta_in):
"""
Decorator to merge multiplication by a scalar on the output.
This will find a pattern of `scal * <yourop>(some, params, alpha,
beta)` and update it so that the scalar multiplication happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way::
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature::
maker(node, *inputs)
The `node` argument you receive is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameter contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as::
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
Returns
-------
local optimizer
an unregistered local optimizer that has the same name as the
decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
"""
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
def opt(node):
if (isinstance(node.op, GpuElemwise) and
node.op.scalar_op == scal.mul and
node.nin == 2):
targ = find_node(node.inputs[0], cls)
if targ is None:
targ = find_node(node.inputs[1], cls)
if targ is None:
return
lr = grab_cpu_scalar(node.inputs[0],
nd=targ.outputs[0].ndim)
else:
lr = grab_cpu_scalar(node.inputs[1],
nd=targ.outputs[0].ndim)
if lr is None or lr.dtype != targ.outputs[0].dtype:
return None
inputs = list(targ.inputs)
try:
c = get_scalar_constant_value(lr)
if c == 0:
inputs[alpha_in] = lr
inputs[beta_in] = lr
elif c == 1:
inputs[alpha_in] = targ.inputs[alpha_in]
inputs[beta_in] = targ.inputs[beta_in]
else:
inputs[alpha_in] = lr * targ.inputs[alpha_in]
inputs[beta_in] = lr * targ.inputs[beta_in]
except NotScalarConstantError:
inputs[alpha_in] = lr * targ.inputs[alpha_in]
inputs[beta_in] = lr * targ.inputs[beta_in]
with inherit_stack_trace(node.outputs):
return maker(targ, *inputs)
return opt
return wrapper
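# Usage sketch (hedged; ``GpuMyGemm`` and the input indices are hypothetical
# stand-ins, not ops defined in this module):
#
#     @alpha_merge(GpuMyGemm, alpha_in=1, beta_in=4)
#     def local_mygemm_alpha_merge(node, *inputs):
#         return [node.op(*inputs)]
#
# The result is an unregistered local optimizer that folds the scalar of
# ``scalar * GpuMyGemm(...)`` into the op's alpha and beta inputs; it still
# needs to be registered with an optimizer database to take effect.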
def output_merge(cls, alpha_in, beta_in, out_in):
"""
Decorator to merge addition by a value on the output.
This will find a pattern of `val + <yourop>(some, params, alpha,
beta, out_like)` and update it so that the addition happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way::
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature::
maker(node, *inputs)
The `node` argument you receive is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameter contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as::
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
out_in : int
The input index for the out_like input for your op (in node.inputs).
Returns
-------
local optimizer
an unregistered local optimizer that has the same name as the
decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
This also correctly handles the case where the added value is
broadcasted (by not performing the replacement).
"""
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
def opt(node):
if (isinstance(node.op, GpuElemwise) and
node.op.scalar_op == scal.add and
node.nin == 2):
targ = find_node(node.inputs[0], cls)
W = node.inputs[1]
if targ is None:
targ = find_node(node.inputs[1], cls)
W = node.inputs[0]
if targ is None:
return None
if W.dtype != targ.outputs[0].dtype:
return None
if not is_equal(targ.inputs[beta_in], 0.0):
# other cases are too complex for now
return None
if W.broadcastable != targ.inputs[out_in].broadcastable:
# Would need to explicitly tile the output to fill
# the full shape here. Disable for now.
return None
inputs = list(targ.inputs)
inputs[out_in] = W
dtype = inputs[beta_in].dtype
one = scal.constant(np.asarray(1.0, dtype=dtype))
inputs[beta_in] = one
with inherit_stack_trace(node.outputs):
return maker(targ, *inputs)
return opt
return wrapper
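# Usage sketch (hedged; same hypothetical ``GpuMyGemm`` and indices as above):
#
#     @output_merge(GpuMyGemm, alpha_in=1, beta_in=4, out_in=0)
#     def local_mygemm_output_merge(node, *inputs):
#         return [node.op(*inputs)]
#
# This folds ``something + GpuMyGemm(...)`` into the op's ``out_like`` input
# (with beta set to one), but only when the op's current beta is known to be
# zero and the added value is not broadcasted against the output.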
def inplace_allocempty(op, idx):
"""
Wrapper to make an inplace optimization that deals with AllocEmpty
This will duplicate the alloc input if it has more than one client
to allow the op to work on it inplace.
The decorated function must have this signature::
maker(node, inputs)
The `node` argument you receive is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
You should also switch the op to work inplace. The `inputs`
parameter contains the new inputs for your op. You MUST use
those inputs instead of the ones on `node`. Note that this
function can be as simple as::
def maker(node, inputs):
return [node.op.__class__(inplace=True)(*inputs)]
Parameters
----------
op : op class
The op class to look for to make inplace
idx : int
The index of the (possibly) AllocEmpty input (in node.inputs).
Returns
-------
local optimizer
an unregistered inplace local optimizer that has the same name
as the decorated function.
"""
def wrapper(maker):
@local_optimizer([op], inplace=True)
@wraps(maker)
def opt(node):
if type(node.op) != op or node.op.inplace:
return
inputs = list(node.inputs)
alloc = inputs[idx]
if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype, alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs)
with inherit_stack_trace(node.outputs):
return maker(node, inputs)
return opt
return wrapper
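# Usage sketch (hedged; ``GpuMyGemm`` is hypothetical and assumed to take its
# output buffer as input 0 and to accept an ``inplace`` flag):
#
#     @inplace_allocempty(GpuMyGemm, idx=0)
#     def local_inplace_mygemm(node, inputs):
#         return [GpuMyGemm(inplace=True)(*inputs)]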
def pad_dims(input, leftdims, rightdims):
"""Reshapes the input to a (leftdims + rightdims) tensor
This helper function is used to convert pooling inputs with arbitrary
non-pooling dimensions to the correct number of dimensions for the
GPU pooling ops.
This reduces or expands the number of leading (non-pooling) dimensions
of the input to exactly `leftdims`, either by adding singleton dimensions
on the left or by collapsing several leading dimensions into one.
Use `unpad_dims` to reshape back to the original dimensions.
Examples
--------
Given input of shape (3, 5, 7), ``pad_dims(input, 2, 2)``
adds a singleton dimension and reshapes to (1, 3, 5, 7).
Given that output from pad_dims, ``unpad_dims(output, input, 2, 2)``
reshapes back to (3, 5, 7).
Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 2)``
does not reshape and returns output with shape (3, 5, 7, 9).
Given input of shape (3, 5, 7, 9, 11), ``pad_dims(input, 2, 2)``
combines the first two dimensions and reshapes to (15, 7, 9, 11).
Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 3)``
adds a singleton dimension and reshapes to (1, 3, 5, 7, 9).
"""
assert input.ndim >= rightdims
if input.ndim == (leftdims + rightdims):
return input
# extract image dimensions
img_shape = input.shape[-rightdims:]
non_pool_ndim = input.ndim - rightdims
if non_pool_ndim < leftdims:
# too few dimensions, pad on the left
dummy_dims = tensor.as_tensor([1] * (leftdims - non_pool_ndim))
new_shape = tensor.join(0, dummy_dims,
input.shape[:non_pool_ndim],
img_shape)
else:
# too many dimensions, combine the leading dimensions
batched_ndim = non_pool_ndim - leftdims + 1
batch_size = tensor.prod(input.shape[:batched_ndim])
# convert to a vector for tensor.join
batch_size = tensor.shape_padright(batch_size, 1)
new_shape = tensor.join(0, batch_size,
input.shape[batched_ndim:non_pool_ndim],
img_shape)
# store in the required shape
new_shape = tensor.cast(new_shape, 'int64')
input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
return input_ND
def unpad_dims(output, input, leftdims, rightdims):
"""Reshapes the output after pad_dims.
This reverts the padding by `pad_dims`.
"""
if output.ndim == input.ndim:
return output
# restore the output to the original shape
outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
return GpuReshape(input.ndim)(output, outshp)
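# Round-trip sketch (hedged; ``gpu_pool_op`` is a hypothetical GPU pooling op
# that expects a 4-d input, i.e. leftdims=2 and rightdims=2):
#
#     inp = tensor.tensor5()              # e.g. shape (3, 5, 7, 9, 11)
#     inp4d = pad_dims(inp, 2, 2)         # collapses leading dims -> (15, 7, 9, 11)
#     out4d = gpu_pool_op(inp4d)          # pools over the last two dimensions
#     out = unpad_dims(out4d, inp, 2, 2)  # restores the leading (3, 5) dimensions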