theano/gpuarray/linalg.py

from __future__ import absolute_import, division, print_function

import warnings

import pkg_resources
import numpy as np
from numpy.linalg.linalg import LinAlgError

import theano
from theano import Op, config, tensor
from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType

from .basic_ops import (CGpuKernelBase, as_gpuarray_variable, gpu_contiguous, gpuarray_helper_inc_dir,
                        infer_context_name)
from .type import gpu_context_type

try:
    import pygpu
    from pygpu.basic import triu, tril
    pygpu_available = True
except ImportError:
    pygpu_available = False

cusolver_available = False
try:
    import skcuda
    from skcuda import cusolver
    cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

cublas_available = False
try:
    from skcuda import cublas
    cublas_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

if cusolver_available:
    # Add cusolver call as it is missing in skcuda
    # SPOTRS
    cusolver._libcusolver.cusolverDnSpotrs.restype = int
    cusolver._libcusolver.cusolverDnSpotrs.argtypes = [cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p]

    def cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda,
                         B, ldb, devInfo):
        """
        Solve real single precision linear system for hermitian matrices.
        References
        ----------
        `cusolverDn<t>potrs <http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-potrs>`_
        """

        status = cusolver._libcusolver.cusolverDnSpotrs(handle, uplo, n, nrhs,
                                                        int(A), lda, int(B),
                                                        ldb, int(devInfo))
        cusolver.cusolverCheckStatus(status)

    # DPOTRS
    # TODO: Are they still missing in skucda?
    cusolver._libcusolver.cusolverDnDpotrs.restype = int
    cusolver._libcusolver.cusolverDnDpotrs.argtypes = [cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p,
                                                       cusolver.ctypes.c_int,
                                                       cusolver.ctypes.c_void_p]

    def cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda,
                         B, ldb, devInfo):
        status = cusolver._libcusolver.cusolverDnDpotrs(handle, uplo, n, nrhs,
                                                        int(A), lda, int(B),
                                                        ldb, int(devInfo))
        cusolver.cusolverCheckStatus(status)


def attach_cusolver_handle_to_context(ctx):
    handle = getattr(ctx, 'cusolver_handle', None)
    if handle is None:
        with ctx:
            ctx.cusolver_handle = cusolver.cusolverDnCreate()


def attach_cublas_handle_to_context(ctx):
    handle = getattr(ctx, 'cublas_handle', None)
    if handle is None:
        with ctx:
            ctx.cublas_handle = cublas.cublasCreate()


# it is a subset of all cases available in slinalg's MATRIX_STRUCTURE
MATRIX_STRUCTURES_SOLVE = (
    'general',
    'symmetric',
    'lower_triangular',
    'upper_triangular')


class GpuCusolverSolve(Op):
    """
    CUSOLVER GPU solver OP.

    Parameters
    ----------
    trans
        Whether to take the transpose of the input matrix or not.

    """

    __props__ = ('A_structure', 'trans', 'inplace')

    def __init__(self, A_structure='general', trans='N', inplace=False):
        self.trans = trans
        self.inplace = inplace
        self.A_structure = A_structure
        if self.inplace:
            self.destroy_map = {0: [0]}
        assert A_structure in MATRIX_STRUCTURES_SOLVE
        super(GpuCusolverSolve, self).__init__()

    def make_node(self, inp1, inp2):
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCusolverSolve Op can not be constructed.')
        if skcuda.__version__ <= '0.5.1':
            warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim == 2
        assert inp1.dtype == inp2.dtype

        return theano.Apply(
            self, [inp1, inp2],
            [GpuArrayType(inp1.dtype,
                          broadcastable=inp1.broadcastable,
                          context_name=context_name)()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def check_dev_info(self, dev_info):
        val = np.asarray(dev_info)[0]
        if val > 0:
            raise LinAlgError('A is singular')

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Solution vectors.
        b = inputs[1]

        assert(len(A.shape) == 2)
        assert(len(b.shape) == 2)

        if self.trans in ['T', 'C']:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == 'N':
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError('Invalid value for trans')
        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b as cusolver operates inplace
        b = pygpu.array(b, copy=True, order='F')
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags['C_CONTIGUOUS']:
            trans = 1 - trans

        if A.dtype == 'float32':
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
            potrs = cusolverDnSpotrs
            getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
            getrf = cusolver.cusolverDnSgetrf
            getrs = cusolver.cusolverDnSgetrs
        elif A.dtype == 'float64':
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
            potrs = cusolverDnDpotrs
            getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
            getrf = cusolver.cusolverDnDgetrf
            getrs = cusolver.cusolverDnDgetrs
        else:
            raise ValueError("Unsupported dtype")

        if self.A_structure == 'symmetric':
            with context:
                workspace_size = potrf_bufferSize(
                    context.cusolver_handle, 0, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                potrf(
                    context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
                    workspace_size, dev_info_ptr)
                self.check_dev_info(dev_info)

                potrs(
                    context.cusolver_handle, 0, n, m, A_ptr, lda,
                    b_ptr, ldb, dev_info_ptr)

        else:
            # general case for A
            with context:
                workspace_size = getrf_bufferSize(
                    context.cusolver_handle, n, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            pivots = pygpu.zeros(n, dtype='int32', context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                getrf(
                    context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                    pivots_ptr, dev_info_ptr)
                self.check_dev_info(dev_info)

                getrs(
                    context.cusolver_handle, trans, n, m, A_ptr, lda,
                    pivots_ptr, b_ptr, ldb, dev_info_ptr)

        z[0] = b

    def L_op(self, inputs, outputs, output_gradients):
        # Modified from theano/tensor/slinalg.py
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]
        # FIXME: triangular structure would use GpuCublasTriangularsolve?
        # no need to handle A_structure like slinalg.py?
        trans_solve_op = GpuCusolverSolve('general')
        b_bar = trans_solve_op(A.T, c_bar)
        A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
        return [A_bar, b_bar]


class GpuCublasTriangularSolve(Op):
    """
    CUBLAS GPU Triangular Solve Op.

    Parameters
    ----------
    lower
        Whether system is lower-triangular (True) or upper-triangular (False).
    trans
        Whether to take the transpose of the input matrix or not.
    """
    __props__ = ('trans', 'lower')

    def __init__(self, lower=True, trans='N'):
        self.trans = trans
        self.lower = lower
        super(GpuCublasTriangularSolve, self).__init__()

    def make_node(self, inp1, inp2):
        if not cublas_available:
            raise RuntimeError('CUBLAS is not available and '
                               'GpuCublasTriangularSolve Op '
                               'can not be constructed.')
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim in [1, 2]
        assert inp1.dtype == inp2.dtype

        return theano.Apply(self, [inp1, inp2],
                            [GpuArrayType(inp1.dtype,
                                          broadcastable=inp2.broadcastable,
                                          context_name=context_name)()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context
        attach_cublas_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        ctx = node.inputs[0].type.context

        # Solution set
        x = outputs[0]

        # Matrix.
        A = inputs[0]

        # right hand side
        b = inputs[1]

        assert(len(A.shape) == 2)
        assert(len(b.shape) in [1, 2])

        # implicitly deal with the difference between C order
        # and fortran order by flipping the trans and lower flags
        lower = not self.lower
        trans = self.trans
        if trans in ['T', 'C']:
            trans = 'N'
            l, n = A.shape
        elif trans == 'N':
            trans = 'T'
            n, l = A.shape
        else:
            raise ValueError('Invalid value for trans')

        if b.ndim == 2:
            k, m = b.shape
        else:
            k, = b.shape
            m = 1

        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # solution overwrites right hand side on exit
        b = pygpu.array(b, copy=True, order='F')

        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # unit scalar used for multiplication
        alpha = 1.0
        # indicates matrix A is on left of B
        side = 'l'
        # set whether upper or lower part of matrix A stored
        uplo = 'l' if lower else 'u'
        # indicates elements on diagonal of matrix A may not be unity
        diag = 'n'

        if A.dtype == 'float32':
            trsv = cublas.cublasStrsv
            trsm = cublas.cublasStrsm
        elif A.dtype == 'float64':
            trsv = cublas.cublasDtrsv
            trsm = cublas.cublasDtrsm
        else:
            raise ValueError("Unsupported dtype")

        with ctx:
            if b.ndim == 1:
                # matrix vector solve
                trsv(ctx.cublas_handle, uplo, trans, diag, n,
                     A_ptr, lda, b_ptr, 1)
            else:
                trsm(ctx.cublas_handle, side, uplo, trans, diag,
                     n, m, alpha, A_ptr, lda, b_ptr, ldb)

        x[0] = b

    def L_op(self, inputs, outputs, output_gradients):
        # Modified from theano/tensor/slinalg.py
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]

        trans_solve_op = GpuCublasTriangularSolve(not self.lower)
        b_bar = trans_solve_op(A.T, c_bar)

        A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)

        if self.lower:
            A_bar = tensor.tril(A_bar)
        else:
            A_bar = tensor.triu(A_bar)
        return [A_bar, b_bar]


def gpu_solve(A, b, A_structure='general', trans='N'):
    if A_structure == 'lower':
        return GpuCublasTriangularSolve(True, trans)(A, b)
    elif A_structure == 'upper':
        return GpuCublasTriangularSolve(False, trans)(A, b)

    return GpuCusolverSolve(A_structure, trans)(A, b)


def gpu_solve_lower_triangular(A, b, trans='N'):
    return GpuCublasTriangularSolve(True, trans)(A, b)


def gpu_solve_upper_triangular(A, b, trans='N'):
    return GpuCublasTriangularSolve(False, trans)(A, b)


class GpuCholesky(Op):
    """
    CUSOLVER GPU Cholesky Op.

    Given a real positive definite matrix `A` returns either a lower
    triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`
    else returns an upper triangular matrix `U` such that `A == dot(U.T, U)`
    if `lower == False`.

    Parameters
    ----------
    lower
        Whether to return a lower rather than upper triangular decomposition.

    """

    __props__ = ('lower', 'inplace')

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}
        super(GpuCholesky, self).__init__()

    def clone_inplace(self):
        return self.__class__(lower=self.lower, inplace=True)

    def make_node(self, inp):
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCholesky Op can not be constructed.')
        if skcuda.__version__ <= '0.5.1':
            warnings.warn('The GpuCholesky op requires scikit-cuda > '
                          '0.5.1 to work with CUDA 8')
        if not pygpu_available:
            raise RuntimeError('Missing pygpu or triu/tril functions.'
                               'Install or update libgpuarray.')
        context_name = infer_context_name(inp)

        inp = as_gpuarray_variable(inp, context_name)

        inp = gpu_contiguous(inp)

        assert inp.ndim == 2

        return theano.Apply(self, [inp], [inp.type()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it
        # probably is as it is the default in Theano) we just switch
        # the fill mode parameter of cusolver
        l_parameter = 0 if self.lower else 1
        if L.flags['C_CONTIGUOUS']:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        if A.dtype == 'float32':
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
        elif A.dtype == 'float64':
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
        else:
            raise ValueError("Unsupported dtype")

        with context:
            workspace_size = potrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            potrf(context.cusolver_handle, l_parameter, n, L_ptr,
                  lda, workspace_ptr, workspace_size, dev_info_ptr)

            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L

    def L_op(self, inputs, outputs, gradients):
        # Modified from theano/tensor/slinalg.py
        # No handling for on_error = 'nan'
        dz = gradients[0]
        chol_x = outputs[0]

        # this is for nan mode
        #
        # ok = ~tensor.any(tensor.isnan(chol_x))
        # chol_x = tensor.switch(ok, chol_x, 1)
        # dz = tensor.switch(ok, dz, 1)

        # deal with upper triangular by converting to lower triangular
        if not self.lower:
            chol_x = chol_x.T
            dz = dz.T

        def tril_and_halve_diagonal(mtx):
            """Extracts lower triangle of square matrix and halves diagonal."""
            return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

        def conjugate_solve_triangular(outer, inner):
            """Computes L^{-T} P L^{-1} for lower-triangular L."""
            return gpu_solve_upper_triangular(
                outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)

        s = conjugate_solve_triangular(
            chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

        if self.lower:
            grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
        else:
            grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))

        return [grad]


def gpu_cholesky(A, lower=True):
    return GpuCholesky(lower)(A)


# TODO: add support for float64
class GpuMagmaBase(COp):
    """Base class for magma related operations. Add the necessary headers,
    libraries and optionally the location of headers and library.
    """
    def c_headers(self):
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'magma.h']

    def c_header_dirs(self):
        dirs = [gpuarray_helper_inc_dir(), pygpu.get_include(),
                config.cuda.include_path]
        if config.magma.include_path:
            dirs.append(config.magma.include_path)
        return dirs

    def c_libraries(self):
        return ['magma']

    def c_lib_dirs(self):
        if config.magma.library_path:
            return [config.magma.library_path]
        return []

    def prepare_node(self, node, storage_map, compute_map, impl):
        from skcuda.magma import magma_init
        ctx = node.inputs[0].type.context
        if not getattr(ctx, 'is_magma_initialized', False):
            with ctx:
                magma_init()
                ctx.is_magma_initialized = True


class GpuMagmaSVD(GpuMagmaBase):
    """Computes the svd of a matrix :math:`A` using magma library.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
        to get them in expected order ``U, S, VT``.

    """
    __props__ = ('full_matrices', 'compute_uv')
    _cop_num_inputs = 1
    _cop_num_outputs = 3
    check_input = False
    params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        COp.__init__(self, ['c_code/magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        if self.compute_uv:
            return theano.Apply(self, [A],
                                # return S, U, VT
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)(),
                                 A.type(),
                                 A.type()])
        else:
            return theano.Apply(self, [A],
                                # return only S
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        super(GpuMagmaSVD, self).prepare_node(node, storage_map, compute_map, impl)
        # Check node to prevent eventual errors with old pickled nodes.
        if self.compute_uv:
            A, B, C = node.outputs
            # We expect order: S (vector), U (matrix), VT (matrix)
            assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, \
                "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) " \
                "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        x_shape, = shapes
        M, N = x_shape
        K = tensor.minimum(M, N)
        s_shape = (K, )
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [s_shape, u_shape, vt_shape]
        else:
            return [s_shape]


def gpu_svd(a, full_matrices=1, compute_uv=1):
    """
    This function performs the SVD on GPU.

    Parameters
    ----------
    full_matrices : bool, optional
        If True (default), u and v have the shapes (M, M) and (N, N),
        respectively.
        Otherwise, the shapes are (M, K) and (K, N), respectively,
        where K = min(M, N).
    compute_uv : bool, optional
        Whether or not to compute u and v in addition to s.
        True by default.

    Returns
    -------
    U, V,  D : matrices

    """
    out = GpuMagmaSVD(full_matrices, compute_uv)(a)
    if compute_uv:
        S, U, VT = out
        out = [U, S, VT]
    return out


class GpuMagmaMatrixInverse(GpuMagmaBase):
    """Computes the inverse of a matrix :math:`A` using magma library.
    """
    __props__ = ('inplace', )
    check_input = False
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        COp.__init__(self, ['c_code/magma_inv.c'], 'APPLY_SPECIFIC(magma_inv)')
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        return theano.Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        return shapes


def gpu_matrix_inverse(a):
    """
    This function performs the matrix inverse on GPU.

    Returns
    -------
    a_inv: matrix

    """
    return GpuMagmaMatrixInverse()(a)


class GpuMagmaCholesky(GpuMagmaBase, CGpuKernelBase):
    """Computes the cholesky decomposition of a matrix :math:`A` using magma
    library.

    """
    __props__ = ('lower', 'inplace')
    check_input = False
    params_type = ParamsType(lower=bool_t, inplace=bool_t, context=gpu_context_type)

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        COp.__init__(self, ['c_code/magma_cholesky.c'], 'APPLY_SPECIFIC(magma_cholesky)')
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(lower=self.lower, inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        return theano.Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        return [shapes[0]]


class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
    """Computes the qr decomposition of a matrix :math:`A` using magma
    library.

    Parameters
    ----------

        complete : If False, returns only ``R``.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``R, Q``. Use :func:`theano.gpuarray.linalg.gpu_qr`
        to get them in expected order ``Q, R``.
    """
    __props__ = ('complete', )
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(complete=bool_t, context=gpu_context_type)

    def __init__(self, complete=True):
        self.complete = complete
        COp.__init__(self, ['c_code/magma_qr.c'], 'APPLY_SPECIFIC(magma_qr)')

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        if self.complete:
            return theano.Apply(self, [A],
                                # return R, Q
                                [A.type(), A.type()])
        else:
            return theano.Apply(self, [A],
                                # return R
                                [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)


def gpu_qr(a, complete=True):
    """
    This function performs the QR on GPU.

    Parameters
    ----------
    complete : bool, optional
        If `False`, returns only r.

    Returns
    -------
    Q, R : matrices

    """
    out = GpuMagmaQR(complete)(a)
    if complete:
        R, Q = out
        out = [Q, R]
    return out


class GpuMagmaEigh(GpuMagmaBase):
    """Computes the eigen decomposition of a symmetric matrix :math:`A` using magma
    library.

    Parameters
    ----------
    UPLO : Specifies whether the calculation is done with the lower triangular
           part of matrix (`L`, default) or the upper triangular part (`U`).
    compute_v : If `True`, computes eigenvalues and eigenvectors (`True`,
                default). If `False`, computes only eigenvalues of matrix.
    """
    __props__ = ('lower', 'compute_v')
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(lower=bool_t, compute_v=bool_t,
                             context=gpu_context_type)

    def __init__(self, UPLO='L', compute_v=True):
        assert UPLO in ['L', 'U']
        self.lower = UPLO == 'L'
        self.compute_v = compute_v
        COp.__init__(self, ['c_code/magma_eigh.c'], 'APPLY_SPECIFIC(magma_eigh)')

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        if self.compute_v:
            return theano.Apply(self, [A],
                                # return D, V
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)(),
                                 A.type()])
        else:
            return theano.Apply(self, [A],
                                # return D
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)