diff --git a/.gitignore b/.gitignore index 62f99999dd7..5865e241be2 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ *.log *.nav *.out +*.pdf *.snm *.toc *.vrb diff --git a/.mailmap b/.mailmap index c59dab83ce8..bb3acb31ee2 100644 --- a/.mailmap +++ b/.mailmap @@ -7,11 +7,8 @@ # # # # This file is up-to-date if the command git log --format="%aN <%aE>" | sort -u # # gives no duplicates. -Jörg Bornschein Joerg Bornschein -Eric Hunsberger hunse -Jan Schlüter f0k + Rami Al-Rfou' Rami Al-Rfou -Arnaud Bergeron David Warde-Farley David Warde-Farley David Warde-Farley David Warde Farley @@ -20,7 +17,7 @@ Douglas Eck eckdoug@waits.local Dumitru Erhan dumitru@deepnets.mtv.corp.google.com Dumitru Erhan erhandum@bikat.iro.umontreal.ca Francois Savard fsavard -Steven Pigeon steven-pigeon + # 5 Firstname Lastname # 4 Laboratoire d'Informatique des Systemes Adaptatifs # 6 Li Yao @@ -28,11 +25,6 @@ Steven Pigeon steven-pigeon # 2 onze # 25 projects@lgcm # 1 tutorial/debug_faq.txt -Bogdan Budescu bbudescu -Sebastian Berg seberg -Huy Nguyen huyng -Wei Li kuantkid -Ethan Buchman ebuchman Frederic Bastien Frederic Bastien Frederic Bastien Frederic Bastien Frederic Bastien Frédéric Bastien @@ -63,7 +55,6 @@ James Bergstra james@mackie James Bergstra james@x40.unstable James Bergstra test_rng_mrg.py John Salvatier jsalvatier -John Salvatier john salvatier Joseph Turian Joseph Turian Joseph Turian turian@grenat.iro.umontreal.ca Joseph Turian turian@lgcm @@ -92,4 +83,3 @@ Sander Dieleman benanne Xavier Glorot glorotxa Xavier Glorot glorotxa@timide.iro.umontreal.ca Yoshua Bengio bengioy@bengio-mac.local -Sina Honari SinaHonari diff --git a/.travis.yml b/.travis.yml index b1f1aa0b29f..3698be65683 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,49 +3,42 @@ language: python python: - - "2.6" + - "2.5" # - "2.7" # - "3.2" - # command to install dependencies before_install: - - sudo apt-get install -q libatlas3gf-base libatlas-dev liblapack-dev gfortran -# Install miniconda to avoid compiling scipy - - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh - - chmod +x miniconda.sh - - ./miniconda.sh -b - - export PATH=/home/travis/miniconda/bin:$PATH - - conda update --yes conda - + - sudo apt-get install -qq libatlas3gf-base libatlas-dev liblapack-dev gfortran +# - sudo apt-get install -qq libopenblas-dev install: -# We support scipy 0.7.2, but it is not available on conda. -# So we test with 0.11. Our internal buildbot have 0.7.2. - - conda create --yes -q -n py26 python=2.6 numpy=1.6 scipy=0.11 nose=1.1 pip - - source activate py26 - - pip install pydot - - pip install . --no-deps --use-mirrors - +# If we don't install numpy before SciPy 0.10.1, the SciPy installations fails. + - "pip install -q numpy==1.5 --use-mirrors" +# We support scipy 0.7.2, but it is not available on pypi anymore. +# So we test with 0.8. Our internal buildbot have 0.7.2. +# We install it later only for the PART that need it. +# - "pip install -q scipy==0.8 --use-mirrors" + - "pip install . --no-deps --use-mirrors" # command to run tests env: - - PART="theano/scan_module/" - - PART="theano/sandbox theano/sparse theano/scalar/ theano/tensor/nnet/" - - PART="theano/tensor/tests/test_basic.py theano/tensor/signal/ theano/compile/ theano/gof/ theano/misc/ theano/tests/ theano/compat" + - PART="theano/tensor/nnet/ theano/tensor/signal/ theano/compile/ theano/gof/ theano/misc/ theano/tests/ theano/compat theano/scan_module/" +# This part is select such that all scipy code is there. 
+# We install scipy only for this part to make the test time faster. + - PART="theano/sandbox theano/sparse theano/scalar/" + - PART="theano/tensor/tests/test_basic.py" - PART="-e test_basic.py theano/tensor/tests" - script: - - export THEANO_FLAGS=blas.ldflags="-lblas -lgfortran",warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise + - "if [ `expr \"$PART\" : '.*sparse'` -gt \"0\" ]; then pip install scipy==0.8 --use-mirrors; fi" + - export THEANO_FLAGS=warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise - python --version - uname -a - free -m - df -h - ulimit -a - - echo "$PART" - - theano-nose -v $PART - - theano-cache list + - echo $PART + - theano-nose $PART + #after_script: after_failure: - cat /home/travis/.pip/pip.log #after_success: - -cache: apt diff --git a/MANIFEST.in b/MANIFEST.in index 458a924bd22..cabc8eaeacb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,10 +1,7 @@ global-include *.txt -global-include *.c global-include *.cu global-include *.cuh -global-include *.h global-include *.sh -global-include *.pkl recursive-include docs include bin/theano-cache include bin/theano-nose diff --git a/NEWS.txt b/NEWS.txt index edba00faf1c..977366d3873 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,391 +1,9 @@ +.. _NEWS: + ============= Release Notes ============= -Theano 0.6 (December 3th, 2013) -=================================== - -We recommend that everybody update to this version. - - -Highlights (since 0.6rc5): - * Last release with support for Python 2.4 and 2.5. - * We will try to release more frequently. - * Fix crash/installation problems. - * Use less memory for conv3d2d. - -0.6rc4 skipped for a technical reason. - -Highlights (since 0.6rc3): - * Python 3.3 compatibility with buildbot test for it. - * Full advanced indexing support. - * Better Windows 64 bit support. - * New profiler. - * Better error messages that help debugging. - * Better support for newer NumPy versions (remove useless warning/crash). - * Faster optimization/compilation for big graph. - * Move in Theano the Conv3d2d implementation. - * Better SymPy/Theano bridge: Make an Theano op from SymPy expression and use SymPy c code generator. - * Bug fixes. - -Change from 0.6rc5: - * Fix crash when specifing march in cxxflags Theano flag. (Frederic B., reported by FiReTiTi) - * code cleanup (Jorg Bornschein) - * Fix Canopy installation on windows when it was installed for all users: Raingo - * Fix Theano tests due to a scipy change. (Frederic B.) - * Work around bug introduced in scipy dev 0.14. (Frederic B.) - * Fix Theano tests following bugfix in SciPy. (Frederic B., reported by Ziyuan Lin) - * Add Theano flag cublas.lib (Misha Denil) - * Make conv3d2d work more inplace (so less memory usage) (Frederic B., repoted by Jean-Philippe Ouellet) - - -Committers since 0.5: - -Frederic Bastien -Pascal Lamblin -Ian Goodfellow -Olivier Delalleau -Razvan Pascanu -abalkin -Arnaud Bergeron -Nicolas Bouchard + -Jeremiah Lowin + -Matthew Rocklin -Eric Larsen + -James Bergstra -David Warde-Farley -John Salvatier + -Vivek Kulkarni + -Yann N. 
Dauphin -Ludwig Schmidt-Hackenberg + -Gabe Schwartz + -Rami Al-Rfou' + -Guillaume Desjardins -Caglar + -Sigurd Spieckermann + -Steven Pigeon + -Bogdan Budescu + -Jey Kottalam + -Mehdi Mirza + -Alexander Belopolsky + -Ethan Buchman + -Jason Yosinski -Nicolas Pinto + -Sina Honari + -Ben McCann + -Graham Taylor -Hani Almousli -Ilya Dyachenko + -Jan Schlüter + -Jorg Bornschein + -Micky Latowicki + -Yaroslav Halchenko + -Eric Hunsberger + -Amir Elaguizy + -Hannes Schulz + -Huy Nguyen + -Ilan Schnell + -Li Yao -Misha Denil + -Robert Kern + -Sebastian Berg + -Vincent Dumoulin + -Wei Li + -XterNalz + - - -A total of 51 people contributed to this release. -People with a "+" by their names contributed a patch for the first time. - - -Theano 0.6rc5 (November 25th, 2013) -=================================== - -We recommend that everybody update to this version. - -We plan to release 0.6 in one week if there is no problem introduced -with this release candidate. - -Theano 0.6rc4 was skipped due to a problem with pypi - -Highlights: - * Python 3.3 compatibility with buildbot test for it. - * Full advanced indexing support. - * Better Windows 64 bit support. - * New profiler. - * Better error messages that help debugging. - * Better support for newer NumPy versions (remove useless warning/crash). - * Faster optimization/compilation for big graph. - * Move in Theano the Conv3d2d implementation. - * Better SymPy/Theano bridge: Make an Theano op from SymPy expression and use SymPy c code generator. - * Bug fixes. - -Committers for this rc5 only: - -Frederic Bastien -Pascal Lamblin -Arnaud Bergeron -abalkin -Olivier Delalleau -John Salvatier -Razvan Pascanu -Jeremiah Lowin -Ludwig Schmidt-Hackenberg + -Vivek Kulkarni -Matthew Rocklin -Gabe Schwartz -James Bergstra -Sigurd Spieckermann + -Bogdan Budescu + -Mehdi Mirza + -Nicolas Bouchard -Ethan Buchman + -Guillaume Desjardins -Ian Goodfellow -Jason Yosinski -Sina Honari + -Ben McCann + -David Warde-Farley -Ilya Dyachenko + -Jan Schluter + -Micky Latowicki + -Yaroslav Halchenko + -Alexander Belopolsky -Hannes Schulz + -Huy Nguyen + -Robert Kern + -Sebastian Berg + -Vincent Dumoulin + -Wei Li + -XterNalz + - - -A total of 36 people contributed to this release. -People with a "+" by their names contributed a patch for the first time. - -Installation: - * Canopy support (direct link to MKL): - * On Linux and Mac OSX (Frederic B., Robert Kern) - * On Windows (Edward Shi, Frederic B.) - - * Anaconda instructions (Pascal L., Frederic B.) - * Doc Ubuntu 13.04 (Frederic B.) - * Better support of newer NumPy version(remove useless warning/crash) (Frederic B., Huy Nguyen) - -Bug fixes: - * Scan: if a scan node was cloned (by theano.clone) with different inputs, and if both the initial and the cloned nodes are used in the function being compiled, the value of the outputs of one would be replaced with the outputs of the other one. (Pascal L.) - * Sparse: Disable the optimization that introduce the CSMGradC op as it doesn't work correctly with unsorted indices. (Frederic B.) - * Mac: Fix wrong result of GpuDownsampleFactorMaxGrad on Mac OSX. (Pascal L.) - * Mac: Auto-Detect and work around a bug in BLAS on MacOS X (Pascal L.) - * Mac: Work around bug in MacOS X. If 2 compiled modules had the same name, the OS or Python was not always the right one even when we used the right handle to it. (Pascal L.) - Use this hash in the Python module, and in %(nodename)s, so that different helper functions in the support code for different Ops will always have different names. 
- * Sparse grad: Fix ConstructSparseFromList.infer_shape (Pascal L., reported by Rami Al-Rfou') - * (introduced in the development version after 0.6rc3 release) (Frederic B.) - Reduction that upcasts the input on no axis (ex: call theano.sum() on a scalar when the original dtype isn't float64 or - [u]int64). It produced bad results as we did not upcasted the inputs in the code, we just copy them. - * Fix some cases of theano.clone() when we get a replacement of x that is a function of x. (Razvan P., reported by Akio Takano) - * Fix grad of Alloc when we unbroadcast the value and it isn't a scalar. (Frederic B., reported Ian G.) - - * In some cases (I think most cases), there was an exception raised in the theano.tensor.grad() method. - But in theory, there could be bad shapes produced in the unbroadcasted dimensions. - -Interface Deprecation (a warning is printed): - * The mode ProfileMode is now deprecated, use the Theano flag profile=True to replace it. - * New theano.sparse_grad() interface to get the sparse grad of a_tensor[an_int_vector]. (Frederic B.) - This can speed up the sparse computations when a small fraction of a_tensor is taken. - Deprecate the old interface for this. (Frederic B.) - -Interface Changes: - * Interface change subtensor and take are not in tensor.basic anymore. They were available from tensor.* and are still available from there. (Frederic B., Matthew Rocklin) - * This lowers the basic.py size to 191k, so under 200k for github search. - * Add -m32 or -m64 in the module cache key and add the python bitwidth in the compiledir path. (Pascal L.) - * mrg.normal now has the parameter size mandatory. It was crashing with the default value of None. (Olivier D.) - * Remove the deprecated passing of multiple modes to theano function. (Frederic B.) - * Change FunctionGraph Features interface of the {on_prune(),on_import()} call back to take a reason. (Frederic B.) - * FunctionGraph now clone the input graph by default. (Frederic B.) - * Added a parameter to optionally not do this cloning. - * This was needed to speed up compilation - -New Interface (reuses existing functionality): - * Add hostname as a var in compiledir_format (Frederic B.) - * Add a new Theano flag: compute_test_value_opt. It takes the same values as compute_test_value. It enables compute_test_value during Theano optimization. Only useful to debug Theano optimization. Also small changes to some optimization to work correctly in that setup. (Frederic B.) - * Add the value pdb to the Theano flag: compute_test_value and compute_test_value_opt. (Frederic B.) - * Add the Theano flag: optimizer_verbose. Default False. When True, we print all the optimization being applied.(Frederic B.) - * Add Op.c_init_code() to allow running the code when the c cmodule is imported (Pascal L.) - * Allow theano.tensor.ones(3) to support scalar and not just list of scalar as numpy.ones (Jeremiah Lowin) - * Make the memory profiler print the FLOPS used for the ops that know how to compute it. (Frederic B.) - -New Features: - * Make tensor.{constant,as_tensor_variable} work with memmap. (Christian Hudon, Frederic Bastien) - * compilation work on ARM processor (Raspberry Pi, Vincent Dumoulin) - * Add numpy.random.choice wrapper to our random number generator (Sigurd Spieckermann) - * Better SymPy/Theano bridge: Make an Theano op from SymPy expression and use SymPy c code generator (Matthew Rocklin) - * Move in Theano the Conv3d2d implementation (James Bergstra, Frederic B., Pascal L.) 
- * First version of the new GPU back-end available (Arnaud Bergeron, Frederic B.) - - * Not all Ops have been converted to this new back-end. - To use, use Theano flag device=cudaN or device=openclN, where N is a integer. - * Python 3.3 compatible (abalkin, Gabe Schwartz, Frederic B., Pascal L.) - * A new profiler (Frederic B.) - The new profiler now can profile the memory with the Theano flag profile_memory=True. - The ProfileMode now can't profile memory anymore and prints a message about it. - Now we raise an error if we try to profile when the gpu is enabled if we didn't set - correctly the env variable to force the driver to sync the kernel launch. - Otherwise the profile information are useless. - The new profiler supports the enabling/disabling of the garbage collection. - * Adds tensor.tri, tensor.triu, and tensor.tril functions that wrap Numpy equivalents (Jeremiah Lowin) - * Adds tensor.nonzero, tensor.flatnonzero functions that wrap Numpy equivalents (Jeremiah Lowin) - * Adds tensor.nonzero_values to get around lack of advanced indexing for nonzero elements (Jeremiah Lowin) - * Make {inc,set}_subtensor work on output of take. (Pascal L.) - * When device=cpu and force_device=True, force that we disable the gpu. (Frederic B.) - * Better Windows 64 bit support for indexing/reshaping (Pascal L.) - * Full advanced indexing support (John Salvatier, seberg) - * Add theano.tensor.stacklist(). Recursivly stack lists of tensors to maintain similar structure (Matthew R.) - * Add Theano flag value: on_opt_error=pdb (Olivier D.) - * GpuSoftmax[WithBias] for bigger row. (Frederic B.) - * Make Erfinv work on the GPU (Guillaume Desjardin, Pascal L.) - * Add "theano-cache basecompiledir purge" (Pascal L.) - This purges all the compiledirs that are in the base compiledir. - * A_tensor_variable.zeros_like() now supports the dtype parameter (Pascal L.) - * More stable reduce operations by default (Pascal L.) - Add an accumulator dtype to CAReduceDtype (acc_dtype) - by default, acc_dtype is float64 for float32 inputs, - then cast to specified output dtype (float32 for float32 inputs) - * Test default blas flag before using it (Pascal L.) - This makes it work correctly by default if no blas library is installed. - * Add cuda.unuse() to help tests that need to enable/disable the GPU (Frederic B.) - * Add theano.tensor.nnet.ultra_fast_sigmoid and the opt (disabled by default) local_ultra_fast_sigmoid. (Frederic B.) - * Add theano.tensor.nnet.hard_sigmoid and the opt (disabled by default) local_hard_sigmoid. (Frederic B.) - * Add class theano.compat.python2x.Counter() (Mehdi Mirza) - * Allow a_cuda_ndarray += another_cuda_ndarray for 6d tensor. (Frederic B.) - * Make the op ExtractDiag work on the GPU. (Frederic B.) - * New op theano.tensor.chi2sf (Ethan Buchman) - * Lift Flatten/Reshape toward input on unary elemwise. (Frederic B.) - This makes the "log(1-sigmoid) -> softplus" stability optimization being applied with a flatten/reshape in the middle. - * Make MonitorMode use the default optimizers config and allow it to change used optimizers (Frederic B.) - * Add support for ScalarOp.c_support_code in GpuElemwise. (Frederic B.) - * Also make the Psi function run on GPU. (Frederic B.) - * Make tensor.outer(x,y) work when ndim != 1 as numpy.outer. - * Kron op: Speed up/generalize/GPU friendly. (Frederic B.) - (It is not an op anymore, but reuses current op) - * Add gpu max for pattern (0, 1) and added all gpu max pattern for gpu min. (Frederic B.) - * Add GpuEye (Frederic B.) 
- * Make GpuCrossentropySoftmaxArgmax1HotWithBias and GpuCrossentropySoftmax1HotWithBiasDx work for bigger inputs (Frederic B., reported by Ryan Price) - * Finish and move out of sandbox theano.sparse.basic.true_dot (Nicolas Bouchard, Frederic B.) - And document all sparse dot variants. - * Implement the mode ignore_borders for GpuImages2Neibs (Frederic B.) - * Make many reduction functions accept a numpy scalar as axis (Jeremiah Lowin) - * Allow numpy.asarray(cuda_ndarray, dtype=...) (Frederic B.) - * theano-cache cleanup now remove cached module old version of code. (Frederic B.) - - -Speed-ups: - * Optimizer speed up. (Frederic B.) - * Fix warning on newer llvm version on Mac. (Pascal L., reported by Jeremiah Lowin and Chris Fonnesbeck) - * Allow pickling of more Ops to allow reusing the compiled code (Pascal L., Frederic B.) - * Optimize more cases of dot22 and scalar when we can't make a gemm (Pascal L., Frederic B.) - * Speed up GpuJoin with c code (Ludwig Schmidt-Hackenberg, Frederic B.) - * Faster GpuAdvancedIncSubtensor1 on Fermi GPU (and up) on matrix. (Vivek Kulkarni) - * Faster GPUAdvancedIncSubtensor1 in some cases on all GPU (Vivek Kulkarni) - * Implemented c_code for AdvancedSubtensor1 (abalkin) - * Add the equivalent of -march=native to g++ command line. (Frederic B., Pascal L.) - * Speed up compilation with Scan (Jan Schluter) - * Merge more Scan nodes together (Pascal L., Yao Li). - * Add MakeVector.c_code (Frederic B.) - * Add Shape.c_code (Frederic B.) - * Optimize Elemwise when all the inputs are fortran (Frederic B.) - We now generate a fortran output and use vectorisable code. - * Add ScalarOp.c_code_contiguous interface and do a default version. (Frederic B.) - This could optimize elemwise by helping the compiler generate SIMD instruction. - * Use ScalarOp.c_code_contiguous with amdlibm. (Frederic B.) - This speeds up exp, pow, sin, cos, log, log2, log10 and sigmoid when the input is contiguous in memory. - * A fix that removes a local_setsubtensor_of_allocs optimization warning and enables it in that case. (Frederic B., reported by John Salvatier) - * Make inv_as_solve optimization work (Matthew Rocklin) - -Crash/no return fixes: - * Fix scan crash in the grad of grad of a scan with special structure (including scan in a scan) (Razvan P., Bitton Tenessi) - * Fix various crashes when calling scan() with inputs specified in unusual ways. (Pascal L.) - * Fix shape crash inserted by Scan optimization. The gradient of some recursive scan was making the PushOutSeqScan optimization insert crash during the execution of a Theano function. (Frederic B., reported by Hugo Larochelle) - * Fix command not returning with recent mingw64 on Windows (Pascal L., reported by many people) - * Fix infinite loop related to Scan on the GPU. (Pascal L.) - * Fix infinite loop when the compiledir is full. (Frederic B.) - * Fix a shape cycle crash in the optimizer (Pascal L., Frederic B., reported by Cho KyungHyun) - * Fix MRG normal() now allow it to generate scalars. (Pascal L.) - * Fix some GPU compilation issue on Mac (John Yani, Frederic B.) - * Fix crash when building symbolic random variables with a mix of symbolic and numeric scalar in the "size" parameter. (Pascal L., Reported by Wu Zhen Zhou) - * Make some Op.grad() implementions not return None (Pascal L.) - * Crash fix in the grad of elemwise about an DisconnectedType (Pascal L, reported by Thomas Wiecki) - * Fix local_gpu_multinomial optimization handling of broadcast information. 
(Frederic B., reported by Caglar) - * Fix crash with change introduced in NumPy 1.7.1 (Pascal L., reported by Thomas Wiecki) - * Compilation failure with complex (Pascal L., reported by autumncat) - * Gpu reduction on all dimensions of a 4d tensor. (Frederic B., reported by Arjun Jain) - * Fix crash for a combination of grad of dot and dimshuffle when only one of the inputs for a corresponding dimensions was knowing that it was broadcastable. (Frederic B., reported by Micky Latowicki) - * AdvancedSubtensor1: allow broadcasted index vector. (Frederic B., reported by Jeremiah Lowin) - * Fix compute_test_value for ifelse (Olivier D., reported by Bitton Tenessi) - * Fix import error with some versions of NumPy (Olivier D.) - * Fix Scan grad exception (Razvan P., reported by Nicolas BL) - * Fix compute_test_value for a non_sequence when calling the gradient of Scan (Pascal L., reported by Bitton Tenessi). - * Crash fix in Scan following interface change in 0.6rc2 (Razvan P.) - * Crash fix on Scan (Razvan P.) - * Crash fix on Scan (Pascal L., reported by Sina Honari and Sigurd) - * Fix crash in Scan gradient related to compute_test_value (Frederic B., reported by Bitton Tenessi) - * Fix a scan optimization warning/error depending of Theano flags (Frederic B.) - * Fixed crash for unimplemented elemwise gradient (Olivier D., reported by Michael McNeil Forbes) - * Fix crash in the elemwise python code for some big shape with power of 2. (Sina Honari, Pascal L.) - * Fix compile and import errors on Windows including for the GPU. (Bogdan Budescu) - * Fix GPU compilation on Windows (XterNalz) - * Fix local_abs_merge optimization crash (Pascal L., reported by Jeremiah Lowin) - * Fix import theano crash when g++ isn't there (Olivier D.) - * Fix crash related to rebuild of Theano graph (Pascal L., reported by Divine Eguzouwa) - * Fix crash during compilation (David Ward-Farley) - * Crash fix in the grad of GPU op in corner case (Pascal L.) - * Crash fix on MacOS X (Robert Kern) - * theano.misc.gnumpy_utils.garray_to_cudandarray() set strides correctly for dimensions of 1. (Frederic B., reported by Justin Bayer) - * Fix crash during optimization with consecutive sums and some combination of axis (Frederic B., reported by Caglar Gulcehre) - * Fix crash with keepdims and negative axis (Frederic B., reported by David W.-F.) - * Fix crash of theano.[sparse.]dot(x,y) when x or y is a vector. (Frederic B., reported by Zsolt Bitvai) - * Fix opt crash/disabled with ifelse on the gpu (Frederic B, reported by Ryan Price) - * Fix crash in optimization involving dot22, (Pascal L., reported by @micklat) - * Prevent shape optimizations from introducing cycles in the graph (Frederic Bastien, Pascal Lamblin, reported by Kyunghyun Cho) - -Others: - * Update/Fixes/Typo/pep8 documentation and/or tutorial (Olivier D., David W.-F., Frederic B., Yaroslav Halchenko, Micky Latowicki, Ben McCann, Jason Yosinski, reported by Arnaud Bergeron) - * Doc how to make a sparse Op. (Frederic B.) - * Doc compatibility guide (abalkin) - * Fix problem in remove_constants_and_unused_inputs_scan. (useless warning and maybe slow down) (Pascal L.) - * Fix rop dot.(Razvan P., reported by Jeremiah Lowin) - * Raise better error related to pydot bug. (Frederic B., reported by Jason Yosinski and Ludwig Schmidt-Hackenberg) - * Fix to Theano tutorial examples. 
(reported by Ilya Dyachenko) - * Fix SharedVar.value property to make it raise an exception (Frederic B., reported by Drew Duncan) - * Fix verification with compute_test_value in grad() (Frederic B.) - * Theano flags are now evaluated lazily, only if requested (Frederic B.) - * Fix test when g++ is not avail (Frederic B.) - * Add manual instructions for OpenBLAS on Ubuntu by (Jianri Li ) - * Better/more error messages (Frederic B., Pascal L., Ian Goodfellow) - * Fix Error reporting with GpuConv (Frederic B., reported by Heng Luo and Nicolas Pinto) - * Now travis-ci tests with scipy the parts that need it (Frederic B.) - * Export some functions that work on CudaNdarray for windows (Frederic B.) - * If the user specifies a -arch=sm_* value in the Theano flags for the gpu, don't add one (Frederic B., Pascal L.) - * If a C thunk returns an error, check if a python exception is set. Otherwise, set a default one (Pascal L.) - * Crash fix introduced in the development version (Wei LI) - * Added BLAS benchmark result (Frederic B., Ben McCann) - * Fix code comment (Hannes Schulz) - * More stable tests (Frederic B.) - * Add utt.asset_allclose(a, b) to have better error message. (Frederic B.) - * Better error message with compute_test_value (Frederic, reported by John Salvatier) - * Stochastic order behavior fix (Frederic B.) - - * Simpler initial graph for subtensor infer shape (Olivier D.) - The optimization was doing the optimization, but this allows better reading of the graph before optimization. - * Better detection of non-aligned ndarray (Frederic B.) - * Update MRG multinomial gradient to the new interface (Mehdi Mirza) - * Implement Image2Neibs.perform() to help debug (Frederic B.) - * Remove some Theano flags from the compilation key (Frederic B.) - * Make theano-nose work on executable '\*.py' files. (Alistair Muldal) - * Make theano-nose work with older nose version (Frederic B.) - * Add extra debug info in verify_grad() (Frederic B.) - - Theano 0.6rc3 (February 14th, 2013) =================================== @@ -524,6 +142,9 @@ Others: * Documentation improvements. (Many people including David W-F, abalkin, Amir Elaguizy, Olivier D., Frederic B.) * The current GPU back-end have a new function CudaNdarray_prep_output(CudaNdarray ** arr, int nd, const int * dims) (Ian G) +============= +Release Notes +============= Theano 0.6rc2 (November 21th, 2012) =================================== @@ -636,6 +257,9 @@ Crash Fixes: Other: * Doc typo fixes, Doc updates, Better error messages: Olivier D., David W.F., Frederic B., James B., Matthew Rocklin, Ian G., abalkin. +============= +Release Notes +============= Theano 0.6rc1 (October 1st, 2012) ================================= @@ -857,7 +481,6 @@ Speed up: Speed up GPU: * Convolution on the GPU now checks the generation of the card to make it faster in some cases (especially medium/big ouput image) (Frederic B.) - * We had hardcoded 512 as the maximum number of threads per block. Newer cards support up to 1024 threads per block. * Faster GpuAdvancedSubtensor1, GpuSubtensor, GpuAlloc (Frederic B.) diff --git a/NEWS_DEV.txt b/NEWS_DEV.txt deleted file mode 100644 index 0709d7ea649..00000000000 --- a/NEWS_DEV.txt +++ /dev/null @@ -1,53 +0,0 @@ -.. _NEWS: - -=================== -DRAFT Release Notes -=================== - -git log -p rel-0.6rc3... |grep Merge|grep '#' |cut -f 8 -d ' ' | replace "#" "* https://github.com/Theano/Theano/pull/" - -git shortlog -sn rel-0.6rc3.. 
- -Done up to PR 1608 -* https://github.com/Theano/Theano/pull/1608 - -* https://github.com/Theano/Theano/pull/1591 # need info - -Interface change: - - theano.tensor.signal.conv2d(2d,2d) output 2d answer. (Frederic B., reported by Alexander Izvorski) - - -Theano Development version -========================== - -NEWS.txt: - -We recommend that everybody update to this version. - -Highlights: - - -Committers for this dev version only: - - -A total of X people contributed to this release. -People with a "+" by their names contributed a patch for the first time. - -Installation: - -Bug fixes: - -Interface Deprecation (a warning is printed): - -Interface Changes: - -New Interface (reuses existing functionality): - -Speed-ups: - -Crash/no return fixes: - -Others: - -Todo for the final release: - * update the NEWS.txt file. diff --git a/bin/theano-cache b/bin/theano-cache index 142f53d9881..f5f74ff9842 100755 --- a/bin/theano-cache +++ b/bin/theano-cache @@ -8,6 +8,7 @@ from theano import config from theano.gof.cc import get_module_cache _logger = logging.getLogger('theano.bin.theano-cache') +_logger.setLevel(logging.WARN) def print_help(exit_status): @@ -18,7 +19,7 @@ def print_help(exit_status): print 'Type "theano-cache clear" to erase the cache' print 'Type "theano-cache list" to print the cache content' print 'Type "theano-cache unlock" to unlock the cache directory' - print 'Type "theano-cache cleanup" to delete keys in the old format/code version' + print 'Type "theano-cache cleanup" to delete keys in the old format' print 'Type "theano-cache purge" to force deletion of the cache directory' print ('Type "theano-cache basecompiledir" ' 'to print the parent of the cache directory') @@ -59,8 +60,6 @@ elif len(sys.argv) == 2: theano.gof.compiledir.print_compiledir_content() elif sys.argv[1] == 'cleanup': theano.gof.compiledir.cleanup() - cache = get_module_cache(init_args=dict(do_refresh=False)) - cache.clear_old() elif sys.argv[1] == 'unlock': theano.gof.compilelock.force_unlock() print 'Lock successfully removed!' diff --git a/bin/theano-nose b/bin/theano-nose index db4b5118093..37cf4ee75e1 100755 --- a/bin/theano-nose +++ b/bin/theano-nose @@ -18,6 +18,7 @@ disable that plugin. import logging _logger = logging.getLogger('theano.bin.theano-nose') +_logger.setLevel(logging.WARN) import os import nose diff --git a/doc/acknowledgement.txt b/doc/acknowledgement.txt index 412e5fbeeb1..427b3962e98 100644 --- a/doc/acknowledgement.txt +++ b/doc/acknowledgement.txt @@ -19,8 +19,3 @@ Acknowledgements `theano/misc/cpucount.py` come from the project `pyprocessing `_. It is available under the same license as Theano. -* Our random number generator implementation on CPU and GPU uses the MRG31k3p algorithm that is described in: - - P. L'Ecuyer and R. Touzin, `Fast Combined Multiple Recursive Generators with Multipliers of the form a = +/- 2^d +/- 2^e `_, Proceedings of the 2000 Winter Simulation Conference, Dec. 2000, 683--689. - - We were authorized by Pierre L'Ecuyer to copy/modify his Java implementation in the `SSJ `_ software and to relicense it under BSD 3-Clauses in Theano. diff --git a/doc/cifarSC2011/advanced_theano.txt b/doc/cifarSC2011/advanced_theano.txt index 6a2fe1019ff..b37dcca132f 100644 --- a/doc/cifarSC2011/advanced_theano.txt +++ b/doc/cifarSC2011/advanced_theano.txt @@ -16,55 +16,46 @@ Conditions **IfElse Example: Comparison with Switch** -.. 
testcode:: - - from theano import tensor as T - from theano.ifelse import ifelse - import theano, time, numpy - - a,b = T.scalars('a','b') - x,y = T.matrices('x','y') - - z_switch = T.switch(T.lt(a,b), T.mean(x), T.mean(y)) - z_lazy = ifelse(T.lt(a,b), T.mean(x), T.mean(y)) +.. code-block:: python - f_switch = theano.function([a,b,x,y], z_switch, - mode=theano.Mode(linker='vm')) - f_lazyifelse = theano.function([a,b,x,y], z_lazy, - mode=theano.Mode(linker='vm')) + from theano import tensor as T + from theano.ifelse import ifelse + import theano, time, numpy - val1 = 0. - val2 = 1. - big_mat1 = numpy.ones((10000,1000)) - big_mat2 = numpy.ones((10000,1000)) + a,b = T.scalars('a','b') + x,y = T.matrices('x','y') + + z_switch = T.switch(T.lt(a,b), T.mean(x), T.mean(y)) + z_lazy = ifelse(T.lt(a,b), T.mean(x), T.mean(y)) - n_times = 10 + f_switch = theano.function([a,b,x,y], z_switch, + mode=theano.Mode(linker='vm')) + f_lazyifelse = theano.function([a,b,x,y], z_lazy, + mode=theano.Mode(linker='vm')) - tic = time.clock() - for i in xrange(n_times): - f_switch(val1, val2, big_mat1, big_mat2) - print 'time spent evaluating both values %f sec'%(time.clock()-tic) + val1 = 0. + val2 = 1. + big_mat1 = numpy.ones((10000,1000)) + big_mat2 = numpy.ones((10000,1000)) - tic = time.clock() - for i in xrange(n_times): - f_lazyifelse(val1, val2, big_mat1, big_mat2) - print 'time spent evaluating one value %f sec'%(time.clock()-tic) + n_times = 10 -.. testoutput:: - :hide: - :options: +ELLIPSIS + tic = time.clock() + for i in xrange(n_times): + f_switch(val1, val2, big_mat1, big_mat2) + print 'time spent evaluating both values %f sec'%(time.clock()-tic) - time spent evaluating both values ... sec - time spent evaluating one value ... sec + tic = time.clock() + for i in xrange(n_times): + f_lazyifelse(val1, val2, big_mat1, big_mat2) + print 'time spent evaluating one value %f sec'%(time.clock()-tic) IfElse Op spend less time (about an half) than Switch since it computes only one variable instead of both. -.. code-block:: none - - $ python ifelse_switch.py - time spent evaluating both values 0.6700 sec - time spent evaluating one value 0.3500 sec +>>> python ifelse_switch.py +time spent evaluating both values 0.6700 sec +time spent evaluating one value 0.3500 sec Note that IfElse condition is a boolean while Switch condition is a tensor, so Switch is more general. @@ -121,7 +112,7 @@ Loops **Scan Example: Calculating a Polynomial** -.. testcode:: +.. code-block:: python import theano import theano.tensor as T @@ -142,10 +133,7 @@ Loops test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32) print calculate_polynomial(test_coeff, 3) - -.. 
testoutput:: - - 19.0 + # 19.0 @@ -279,7 +267,7 @@ Printing/Drawing Theano graphs ``theano.printing.pprint(variable)`` ->>> theano.printing.pprint(prediction) # doctest: +SKIP +>>> theano.printing.pprint(prediction) gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))),TensorConstant{0.5}) @@ -287,7 +275,7 @@ gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))),TensorC ``theano.printing.debugprint({fct, variable, list of variables})`` ->>> theano.printing.debugprint(prediction) # doctest: +SKIP +>>> theano.printing.debugprint(prediction) Elemwise{gt,no_inplace} [@181772236] '' |Elemwise{true_div,no_inplace} [@181746668] '' | |InplaceDimShuffle{x} [@181746412] '' @@ -305,7 +293,7 @@ Elemwise{gt,no_inplace} [@181772236] '' | | | | | |b [@181730156] |InplaceDimShuffle{x} [@181771788] '' | |TensorConstant{0.5} [@181771148] ->>> theano.printing.debugprint(predict) # doctest: +SKIP +>>> theano.printing.debugprint(predict) Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2 |dot [@183018796] '' 1 | |x [@183000780] @@ -316,19 +304,19 @@ Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2 - Picture Printing of Graphs ->>> theano.printing.pydotprint_variables(prediction) # doctest: +SKIP +>>> theano.printing.pydotprint_variables(prediction) .. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_prediction.png :width: 800 px All pydotprint* requires graphviz and pydot ->>> theano.printing.pydotprint(predict) # doctest: +SKIP +>>> theano.printing.pydotprint(predict) .. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_predic.png :width: 800 px ->>> theano.printing.pydotprint(train) # This is a small train example! # doctest: +SKIP +>>> theano.printing.pydotprint(train) # This is a small train example! .. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_train.png :width: 1500 px diff --git a/doc/cifarSC2011/boot_camp_overview.txt b/doc/cifarSC2011/boot_camp_overview.txt index 861bd1164f4..322b8fcef7d 100644 --- a/doc/cifarSC2011/boot_camp_overview.txt +++ b/doc/cifarSC2011/boot_camp_overview.txt @@ -15,7 +15,7 @@ Day 1 * Show of hands - what is your background? -* Python & NumPy in a nutshell +* Python & Numpy in a nutshell * Theano basics diff --git a/doc/cifarSC2011/gpundarray.txt b/doc/cifarSC2011/gpundarray.txt index 05a05abaeac..0babc8ea6c6 100644 --- a/doc/cifarSC2011/gpundarray.txt +++ b/doc/cifarSC2011/gpundarray.txt @@ -1,5 +1,5 @@ -.. _cifar2013_gpundarray: +.. _gpundarray: ********** GpuNdArray diff --git a/doc/cifarSC2011/index.txt b/doc/cifarSC2011/index.txt index 92debb1e54d..3c1c043c015 100644 --- a/doc/cifarSC2011/index.txt +++ b/doc/cifarSC2011/index.txt @@ -18,7 +18,7 @@ What does it do? * symbolic differentiation. -It complements the Python numeric/scientific software stack (e.g. NumPy, SciPy, +It complements the Python numeric/scientific software stack (e.g. numpy, scipy, scikits, matplotlib, PIL.) Design and feature set has been driven by machine learning research diff --git a/doc/cifarSC2011/introduction.txt b/doc/cifarSC2011/introduction.txt index 836568812fd..06e1ac06480 100644 --- a/doc/cifarSC2011/introduction.txt +++ b/doc/cifarSC2011/introduction.txt @@ -13,7 +13,7 @@ Background Questionaire * What did you do with it? -* Who has used Python? NumPy? SciPy? matplotlib? +* Who has used Python? numpy? scipy? matplotlib? * Who has used iPython? @@ -72,14 +72,14 @@ Python in one slide # PYTHON SYNTAX EXAMPLE ####################### a = 1 # no type declaration required! 
- b = (1, 2, 3) # tuple of three int literals - c = [1, 2, 3] # list of three int literals + b = (1,2,3) # tuple of three int literals + c = [1,2,3] # list of three int literals d = {'a': 5, b: None} # dictionary of two elements # N.B. string literal, None print d['a'] # square brackets index # -> 5 - print d[(1, 2, 3)] # new tuple == b, retrieves None + print d[(1,2,3)] # new tuple == b, retrieves None # -> None print d[6] # raises KeyError Exception @@ -116,18 +116,18 @@ Python in one slide print Bar(99).hello() # Creating an instance of Bar # -> 99 -NumPy in one slide +Numpy in one slide ------------------ * Python floats are full-fledged objects on the heap * Not suitable for high-performance computing! -* NumPy provides a N-dimensional numeric array in Python +* Numpy provides a N-dimensional numeric array in Python * Perfect for high-performance computing. -* NumPy provides +* Numpy provides * elementwise computations @@ -135,7 +135,7 @@ NumPy in one slide * pseudorandom numbers from many distributions -* SciPy provides lots more, including +* Scipy provides lots more, including * more linear algebra @@ -148,29 +148,29 @@ NumPy in one slide .. code-block:: python ############################## - # Properties of NumPy arrays + # Properties of Numpy arrays # that you really need to know ############################## import numpy as np # import can rename - a = np.random.rand(3, 4, 5) # random generators + a = np.random.rand(3,4,5) # random generators a32 = a.astype('float32') # arrays are strongly typed a.ndim # int: 3 - a.shape # tuple: (3, 4, 5) + a.shape # tuple: (3,4,5) a.size # int: 60 a.dtype # np.dtype object: 'float64' a32.dtype # np.dtype object: 'float32' Arrays can be combined with numeric operators, standard mathematical -functions. NumPy has great `documentation `_. +functions. Numpy has great `documentation `_. -Training an MNIST-ready classification neural network in pure NumPy might look like this: +Training an MNIST-ready classification neural network in pure numpy might look like this: .. code-block:: python ######################### - # NumPy for Training a + # Numpy for Training a # Neural Network on MNIST ######################### @@ -186,23 +186,23 @@ Training an MNIST-ready classification neural network in pure NumPy might look l batchsize = 100 for i in xrange(1000): - x_i = x[i * batchsize: (i + 1) * batchsize] - y_i = y[i * batchsize: (i + 1) * batchsize] + x_i = x[i*batchsize:(i+1)*batchsize] + y_i = y[i*batchsize:(i+1)*batchsize] hidin = np.dot(x_i, w) + b hidout = np.tanh(hidin) outin = np.dot(hidout, v) + c - outout = (np.tanh(outin) + 1) / 2.0 + outout = (np.tanh(outin)+1)/2.0 g_outout = outout - y_i - err = 0.5 * np.sum(g_outout ** 2) + err = 0.5 * np.sum(g_outout**2) g_outin = g_outout * outout * (1.0 - outout) g_hidout = np.dot(g_outin, v.T) - g_hidin = g_hidout * (1 - hidout ** 2) + g_hidin = g_hidout * (1 - hidout**2) b -= lr * np.sum(g_hidin, axis=0) c -= lr * np.sum(g_outin, axis=0) @@ -215,9 +215,9 @@ What's missing? * Non-lazy evaluation (required by Python) hurts performance -* NumPy is bound to the CPU +* Numpy is bound to the CPU -* NumPy lacks symbolic or automatic differentiation +* Numpy lacks symbolic or automatic differentiation Now let's have a look at the same algorithm in Theano, which runs 15 times faster if you have GPU (I'm skipping some dtype-details which we'll come back to). @@ -229,42 +229,40 @@ you have GPU (I'm skipping some dtype-details which we'll come back to). 
# Neural Network on MNIST ######################### - import numpy as np - - import theano - import theano.tensor as tensor + import theano as T + import theano.tensor as TT x = np.load('data_x.npy') y = np.load('data_y.npy') # symbol declarations - sx = tensor.matrix() - sy = tensor.matrix() - w = theano.shared(np.random.normal(avg=0, std=.1, - size=(784, 500))) - b = theano.shared(np.zeros(500)) - v = theano.shared(np.zeros((500, 10))) - c = theano.shared(np.zeros(10)) + sx = TT.matrix() + sy = TT.matrix() + w = T.shared(np.random.normal(avg=0, std=.1, + size=(784, 500))) + b = T.shared(np.zeros(500)) + v = T.shared(np.zeros((500, 10))) + c = T.shared(np.zeros(10)) # symbolic expression-building - hid = tensor.tanh(tensor.dot(sx, w) + b) - out = tensor.tanh(tensor.dot(hid, v) + c) - err = 0.5 * tensor.sum(out - sy) ** 2 - gw, gb, gv, gc = tensor.grad(err, [w, b, v, c]) + hid = TT.tanh(TT.dot(sx, w) + b) + out = TT.tanh(TT.dot(hid, v) + c) + err = 0.5 * TT.sum(out - sy)**2 + gw, gb, gv, gc = TT.grad(err, [w,b,v,c]) # compile a fast training function - train = theano.function([sx, sy], err, + train = T.function([sx, sy], err, updates={ - w: w - lr * gw, - b: b - lr * gb, - v: v - lr * gv, - c: c - lr * gc}) + w:w - lr * gw, + b:b - lr * gb, + v:v - lr * gv, + c:c - lr * gc}) # now do the computations batchsize = 100 for i in xrange(1000): - x_i = x[i * batchsize: (i + 1) * batchsize] - y_i = y[i * batchsize: (i + 1) * batchsize] + x_i = x[i*batchsize:(i+1)*batchsize] + y_i = y[i*batchsize:(i+1)*batchsize] err_i = train(x_i, y_i) @@ -288,7 +286,7 @@ Theano in one slide * Expression substitution optimizations automatically draw on many backend technologies for best performance. - * FFTW, MKL, ATLAS, SciPy, Cython, CUDA + * FFTW, MKL, ATLAS, Scipy, Cython, CUDA * Slower fallbacks always available diff --git a/doc/cifarSC2011/pyCUDA.txt b/doc/cifarSC2011/pyCUDA.txt index 6713fe189c0..6aac7f5c3da 100644 --- a/doc/cifarSC2011/pyCUDA.txt +++ b/doc/cifarSC2011/pyCUDA.txt @@ -75,12 +75,12 @@ Exercise 6 - Modify and execute it to work for a matrix of 20 x 10 -.. _cifar2011_pyCUDA_theano: +.. _pyCUDA_theano: Theano + PyCUDA --------------- -.. testcode:: +.. code-block:: python import numpy, theano import theano.misc.pycuda_init @@ -118,20 +118,15 @@ Theano + PyCUDA pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size), block=(512,1,1), grid=grid) return thunk - -.. testoutput:: - :hide: - :options: +SKIP - - This contains GPU code so skip it + Test it! ->>> x = theano.tensor.fmatrix() # doctest: +SKIP ->>> f = theano.function([x], PyCUDADoubleOp()(x)) # doctest: +SKIP ->>> xv=numpy.ones((4,5), dtype="float32") # doctest: +SKIP ->>> assert numpy.allclose(f(xv), xv*2) # doctest: +SKIP ->>> print numpy.asarray(f(xv)) # doctest: +SKIP +>>> x = theano.tensor.fmatrix() +>>> f = theano.function([x], PyCUDADoubleOp()(x)) +>>> xv=numpy.ones((4,5), dtype="float32") +>>> assert numpy.allclose(f(xv), xv*2) +>>> print numpy.asarray(f(xv)) Exercises 7 ----------- diff --git a/doc/cifarSC2011/theano.txt b/doc/cifarSC2011/theano.txt index ce726dc8afb..659526d6095 100644 --- a/doc/cifarSC2011/theano.txt +++ b/doc/cifarSC2011/theano.txt @@ -345,11 +345,20 @@ Differentiation details * We are working on the missing optimizations to be able to compute efficently the full Jacobian and Hessian and Jacobian x vector -.. 
_cifar2011_benchmark: Benchmarks ---------- +Example: + +* Multi-layer perceptron +* Convolutional Neural Networks +* Misc Elemwise operations + +Competitors: NumPy + SciPy, MATLAB, EBLearn, Torch5, numexpr + +* EBLearn, Torch5: specialized libraries written by practitioners specifically for these tasks +* numexpr: similar to Theano, 'virtual machine' for elemwise expressions **Multi-Layer Perceptron**: diff --git a/doc/citation.txt b/doc/citation.txt deleted file mode 100644 index 804d115d7a4..00000000000 --- a/doc/citation.txt +++ /dev/null @@ -1,41 +0,0 @@ - -.. _citation: - - -Theano Citation Policy -====================== - -If you use Theano for academic research, you are highly encouraged (though not -required) to cite the following two papers: - -* F. Bastien, P. Lamblin, R. Pascanu, J. Bergstra, I. Goodfellow, - A. Bergeron, N. Bouchard, D. Warde-Farley and Y. Bengio. - `"Theano: new features and speed improvements" - `_. - NIPS 2012 deep learning workshop. (`BibTex - `_) - -* J. Bergstra, O. Breuleux, F. Bastien, P. Lamblin, R. - Pascanu, G. Desjardins, J. Turian, D. Warde-Farley and Y. - Bengio. `"Theano: A CPU and GPU Math Expression Compiler" - `_. - *Proceedings of the Python for Scientific Computing Conference (SciPy) - 2010. June 30 - July 3, Austin, TX* (`BibTeX - `_) - -Theano is primarily developed by academics, and so citations matter a lot to -us. As an added benefit, you increase Theano's exposure and potential user -(and developer) base, which is to the benefit of all users of Theano. Thanks -in advance! - -Previously, we only asked users of Theano to cite the original 2010 paper. However, -this policy did not give appropriate credit to the many members of our community -who have contributed to Theano in the meantime. - -In the future, we intend to introduce new papers periodically (hopefully approximately -once per year) with a comprehensive author list. As soon as one of these papers is -prepared, we will only ask for users to cite the single most recent paper with the -most comprehensive author list. - - - diff --git a/doc/conf.py b/doc/conf.py index d0c72ac8075..2dc96752819 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -23,7 +23,7 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'sphinx.ext.doctest'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo'] todo_include_todos = True @@ -53,7 +53,7 @@ # The short X.Y version. version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.6' +release = '0.6rc3' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: diff --git a/doc/crei2013/advanced_theano.txt b/doc/crei2013/advanced_theano.txt deleted file mode 100644 index 2f91f5cf7d1..00000000000 --- a/doc/crei2013/advanced_theano.txt +++ /dev/null @@ -1,216 +0,0 @@ - -.. _crei2013_advanced_theano: - -*************** -Advanced Theano -*************** - - -Profiling ---------- - -- To replace the default mode with this mode, use the Theano flags ``profile=True`` - -- To enable the memory profiling use the flags ``profile_memory=True`` - -Theano output: - -.. literalinclude:: logreg_profile.prof - -Compilation pipeline --------------------- - -.. 
image:: ../hpcs2011_tutorial/pics/pipeline.png - :width: 400 px - - -Inplace optimization --------------------- - -- 2 type of inplace operations: - - - An op that return a view on its inputs (e.g. reshape, inplace transpose) - - An op that write the output on the inputs memory space - -- This allows some memory optimization -- The Op must tell Theano if they work inplace -- Inplace Op add constraints to the order of execution - - -Conditions ----------- -**IfElse** - -- Build condition over symbolic variables. -- IfElse Op takes a boolean condition and two variables to compute as input. -- While Switch Op evaluates both 'output' variables, IfElse Op is lazy and only - evaluates one variable respect to the condition. - -**IfElse Example: Comparison with Switch** - -.. literalinclude:: ifelse_switch.py - -IfElse Op spend less time (about an half) than Switch since it computes only -one variable instead of both. - ->>> python ifelse_switch.py -time spent evaluating both values 0.230000 sec -time spent evaluating one value 0.120000 sec - -Note that IfElse condition is a boolean while Switch condition is a tensor, so -Switch is more general. - -It is actually important to use ``linker='vm'`` or ``linker='cvm'``, -otherwise IfElse will compute both variables and take the same computation -time as the Switch Op. The linker is not currently set by default to 'cvm' but -it will be in a near future. - -Loops ------ - -**Scan** - -- General form of **recurrence**, which can be used for looping. -- **Reduction** and **map** (loop over the leading dimensions) are special cases of Scan -- You 'scan' a function along some input sequence, producing an output at each time-step -- The function can see the **previous K time-steps** of your function -- ``sum()`` could be computed by scanning the z + x(i) function over a list, given an initial state of ``z=0``. -- Often a for-loop can be expressed as a ``scan()`` operation, and ``scan`` is the closest that Theano comes to looping. -- The advantage of using ``scan`` over for loops - - - The number of iterations to be part of the symbolic graph - - Minimizes GPU transfers if GPU is involved - - Compute gradients through sequential steps - - Slightly faster then using a for loop in Python with a compiled Theano function - - Can lower the overall memory usage by detecting the actual amount of memory needed - -**Scan Example: Computing pow(A,k)** - -.. literalinclude:: scan_pow.py - - -**Scan Example: Calculating a Polynomial** - -.. literalinclude:: scan_poly.py - -Exercise 4 ------------ - -- Run both examples -- Modify and execute the polynomial example to have the reduction done by scan - - -Exercise 5 ------------ - -- In the last exercises, do you see a speed up with the GPU? -- Where does it come from? (Use ProfileMode) -- Is there something we can do to speed up the GPU version? 
- - -Printing/Drawing Theano graphs ------------------------------- - -- Pretty Printing - -``theano.printing.pprint(variable)`` - ->>> theano.printing.pprint(prediction) -gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))),TensorConstant{0.5}) - - -- Debug Print - -``theano.printing.debugprint({fct, variable, list of variables})`` - ->>> theano.printing.debugprint(prediction) -Elemwise{gt,no_inplace} [@181772236] '' - |Elemwise{true_div,no_inplace} [@181746668] '' - | |InplaceDimShuffle{x} [@181746412] '' - | | |TensorConstant{1} [@181745836] - | |Elemwise{add,no_inplace} [@181745644] '' - | | |InplaceDimShuffle{x} [@181745420] '' - | | | |TensorConstant{1} [@181744844] - | | |Elemwise{exp,no_inplace} [@181744652] '' - | | | |Elemwise{sub,no_inplace} [@181744012] '' - | | | | |Elemwise{neg,no_inplace} [@181730764] '' - | | | | | |dot [@181729676] '' - | | | | | | |x [@181563948] - | | | | | | |w [@181729964] - | | | | |InplaceDimShuffle{x} [@181743788] '' - | | | | | |b [@181730156] - |InplaceDimShuffle{x} [@181771788] '' - | |TensorConstant{0.5} [@181771148] ->>> theano.printing.debugprint(predict) -Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2 - |dot [@183018796] '' 1 - | |x [@183000780] - | |w [@183000812] - |InplaceDimShuffle{x} [@183133580] '' 0 - | |b [@183000876] - |TensorConstant{[ 0.5]} [@183084108] - -- Picture Printing of Graphs - ->>> theano.printing.pydotprint_variables(prediction) - -.. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_prediction.png - :width: 800 px - -All pydotprint* requires graphviz and pydot - ->>> theano.printing.pydotprint(predict) - -.. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_predic.png - :width: 800 px - ->>> theano.printing.pydotprint(train) # This is a small train example! - -.. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_train.png - :width: 1500 px - - -Debugging ---------- - -- Run with the Theano flag ``compute_test_value = {``off'',``ignore'', ``warn'', ``raise''}`` - - - Run the code as we create the graph - - Allows you to find the bug earlier (ex: shape mismatch) - - Makes it easier to identify where the problem is in *your* code - - Use the value of constants and shared variables directly - - For pure symbolic variables uses ``x.tag.test_value = numpy.random.rand(5,10)`` - -- Run with the flag ``mode=FAST_COMPILE`` - - - Few optimizations - - Run Python code (better error messages and can be debugged interactively in the Python debugger) - -- Run with the flag ``mode=DebugMode`` - - - 100-1000x slower - - Test all optimization steps from the original graph to the final graph - - Checks many things that Op should/shouldn't do - - Executes both the Python and C code versions - -Known limitations ------------------ - -- Compilation phase distinct from execution phase - - - Use ``a_tensor_variable.eval()`` to make this less visible - -- Compilation time can be significant - - - Amortize it with functions over big input or reuse functions - -- Execution overhead - - - We have worked on this, but more work needed - - So needs a certain number of operations to be useful - -- Compilation time superlinear in the size of the graph. - - - Hundreds of nodes is fine - - Disabling a few optimizations can speed up compilation - - Usually too many nodes indicates a problem with the graph diff --git a/doc/crei2013/gpundarray.txt b/doc/crei2013/gpundarray.txt deleted file mode 100644 index f0462975a17..00000000000 --- a/doc/crei2013/gpundarray.txt +++ /dev/null @@ -1,42 +0,0 @@ - -.. 
_crei2013_gpundarray: - -********** -GpuNdArray -********** - -Why a common GPU ndarray? -------------------------- - -- Currently there are at least 4 different GPU array data structures in use by Python packages - - - CudaNdarray (Theano), GPUArray (PyCUDA), CUDAMatrix (cudamat), GPUArray (PyOpenCL), ... - - There are even more if we include other languages - -- All of them are a subset of the functionality of ``numpy.ndarray`` on the GPU -- Lots of duplicated effort - - - GPU code is harder/slower to do {\bf correctly} and {\bf fast} than on the CPU/Python - -- Lack of a common array API makes it harder to port/reuse code -- Also harder to find/distribute code -- Divides development work - - -Design Goals ------------- - -- Make it VERY similar to ``numpy.ndarray`` -- Be compatible with both CUDA and OpenCL -- Have the base object accessible from C to allow collaboration with more projects, across high-level languages - - - We want people from C, C++, Ruby, R, ... all use the same base GPU N-dimensional array - - -Final Note ----------- - -- Under development -- Will be the next GPU array container for Theano (*this summer!*) -- Probably also for PyCUDA, PyOpenCL -- Mailing list: http://lists.tiker.net/listinfo/gpundarray diff --git a/doc/crei2013/ifelse_switch.py b/doc/crei2013/ifelse_switch.py deleted file mode 100644 index 31cd3223ca2..00000000000 --- a/doc/crei2013/ifelse_switch.py +++ /dev/null @@ -1,33 +0,0 @@ -import time - -import numpy - -import theano -from theano import tensor as tt -from theano.ifelse import ifelse - -a, b = tt.scalars('a', 'b') -x, y = tt.matrices('x', 'y') - -z_switch = tt.switch(tt.lt(a, b), tt.mean(x), tt.mean(y)) -z_lazy = ifelse(tt.lt(a, b), tt.mean(x), tt.mean(y)) - -f_switch = theano.function([a, b, x, y], z_switch) -f_lazyifelse = theano.function([a, b, x, y], z_lazy) - -val1 = 0. -val2 = 1. -big_mat1 = numpy.ones((10000, 1000)) -big_mat2 = numpy.ones((10000, 1000)) - -n_times = 10 - -tic = time.clock() -for i in xrange(n_times): - f_switch(val1, val2, big_mat1, big_mat2) -print 'time spent evaluating both values %f sec' % (time.clock() - tic) - -tic = time.clock() -for i in xrange(n_times): - f_lazyifelse(val1, val2, big_mat1, big_mat2) -print 'time spent evaluating one value %f sec' % (time.clock() - tic) \ No newline at end of file diff --git a/doc/crei2013/index.txt b/doc/crei2013/index.txt deleted file mode 100644 index 101ce7f2970..00000000000 --- a/doc/crei2013/index.txt +++ /dev/null @@ -1,71 +0,0 @@ - -.. _crei2013_index: - -=========================== -Theano Tutorial @ CREI 2013 -=========================== - -July 19, 2013, Sherbrook, Québec, Canada. - - -Theano is python software for evaluating complicated array expressions. - -What does it do? - - * aggressive expression optimizations, - - * automatic GPU use, - - * symbolic differentiation and R op. - -It complements the Python numeric/scientific software stack (e.g. NumPy, SciPy, -scikits, matplotlib, PIL.) - -Design and feature set has been driven by machine learning research -at the University of -Montreal (groups of Yoshua Bengio, Pascal Vincent, Aaron Courville and Roland Memisevic) -The result is a very good library for doing research in deep -learning and neural network training, and a flexible framework for -many other models and algorithms in machine learning more generally. 
- -It has proven to be useful for implementing: - - - linear and nonlinear neural network classifiers - - - convolutional models - - - Energy models: RBM, DBN, GRBM, ssRBM, AIS - - - Auto-encoders: DAE, CAE - - - GP regression - - - sparse coding - - - recurrent neural networks, echo state, (HMM?) - - - online and batch learning and optimization - - - Even SVM! - -As people's needs change this list will grow, but Theano is built -around vector, matrix, and tensor expressions; there is little reason -to use it for calculations on other data structures except. There is -also some sparse matrix support. - - -Contents --------- - -The structured part of these lab sessions will be a walk-through of the following -material. Interleaved with this structured part will be blocks of time for -individual or group work. The idea is that you can try out Theano and get help -from gurus on hand if you get stuck. - -.. toctree:: - - introduction - theano - advanced_theano - gpundarray - /tutorial/extending_theano diff --git a/doc/crei2013/introduction.txt b/doc/crei2013/introduction.txt deleted file mode 100644 index 8e0269b47e5..00000000000 --- a/doc/crei2013/introduction.txt +++ /dev/null @@ -1,397 +0,0 @@ - -.. _crei2013_Introduction: - - -************ -Introduction -************ - -Background Questionaire ------------------------ - -* Who has used Theano before? - - * What did you do with it? - -* Who has used Python? NumPy? SciPy? matplotlib? - -* Who has used iPython? - - * Who has used it as a distributed computing engine? - -* Who has done C/C++ programming? - -* Who has organized computation around a particular physical memory layout? - -* Who has used a multidimensional array of >2 dimensions? - -* Who has written a Python module in C before? - - * Who has written a program to *generate* Python modules in C? - -* Who has used a templating engine? - -* Who has programmed a GPU before? - - * Using OpenGL / shaders ? - - * Using CUDA (runtime? / driver?) - - * Using PyCUDA ? - - * Using OpenCL / PyOpenCL ? - - * Using cudamat / gnumpy ? - - * Other? - -* Who has used Cython? - - -Python in one slide -------------------- - -* General-purpose high-level OO interpreted language - -* Emphasizes code readability - -* Comprehensive standard library - -* Dynamic type and memory management - -* Built-in types: int, float, str, list, dict, tuple, object - -* Slow execution - -* Popular in web-dev and scientific communities - - -.. code-block:: python - - ####################### - # PYTHON SYNTAX EXAMPLE - ####################### - a = 1 # no type declaration required! - b = (1, 2, 3) # tuple of three int literals - c = [1, 2, 3] # list of three int literals - d = {'a': 5, b: None} # dictionary of two elements - # N.B. string literal, None - - print d['a'] # square brackets index - # -> 5 - print d[(1, 2, 3)] # new tuple == b, retrieves None - # -> None - print d[6] - # raises KeyError Exception - - x, y, z = 10, 100, 100 # multiple assignment from tuple - x, y, z = b # unpacking a sequence - - b_squared = [b_i**2 for b_i in b] # list comprehension - - def foo(b, c=3): # function w default param c - return a + b + c # note scoping, indentation - - foo(5) # calling a function - # -> 1 + 5 + 3 == 9 # N.B. 
scoping - foo(b=6, c=2) # calling with named args - # -> 1 + 6 + 2 == 9 - - print b[1:3] # slicing syntax - - class Foo(object): # Defining a class - def __init__(self): - self.a = 5 - def hello(self): - return self.a - - f = Foo() # Creating a class instance - print f.hello() # Calling methods of objects - # -> 5 - - class Bar(Foo): # Defining a subclass - def __init__(self, a): - self.a = a - - print Bar(99).hello() # Creating an instance of Bar - # -> 99 - -NumPy in one slide ------------------- - -* Python floats are full-fledged objects on the heap - - * Not suitable for high-performance computing! - -* NumPy provides a N-dimensional numeric array in Python - - * Perfect for high-performance computing. - * Slice are return view (no copy) - -* NumPy provides - - * elementwise computations - - * linear algebra, Fourier transforms - - * pseudorandom numbers from many distributions - -* SciPy provides lots more, including - - * more linear algebra - - * solvers and optimization algorithms - - * matlab-compatible I/O - - * I/O and signal processing for images and audio - -.. code-block:: python - - ############################## - # Properties of NumPy arrays - # that you really need to know - ############################## - - import numpy as np # import can rename - a = np.random.rand(3, 4, 5) # random generators - a32 = a.astype('float32') # arrays are strongly typed - - a.ndim # int: 3 - a.shape # tuple: (3, 4, 5) - a.size # int: 60 - a.dtype # np.dtype object: 'float64' - a32.dtype # np.dtype object: 'float32' - - assert a[1, 1, 1] != 10 # a[1, 1, 1] is a view - a[1, 1, 1] = 10 # So affectation to it change the - assert a[1, 1, 1] == 10 # original array - - -Arrays can be combined with numeric operators, standard mathematical -functions. NumPy has great `documentation `_. - -Training an MNIST-ready classification neural network in pure NumPy might look like this: - -.. code-block:: python - - ######################### - # NumPy for Training a - # Neural Network on MNIST - ######################### - - x = np.load('data_x.npy') - y = np.load('data_y.npy') - w = np.random.normal( - avg=0, - std=.1, - size=(784, 500)) - b = np.zeros((500,)) - v = np.zeros((500, 10)) - c = np.zeros((10,)) - - batchsize = 100 - for i in xrange(1000): - x_i = x[i * batchsize: (i + 1) * batchsize] - y_i = y[i * batchsize: (i + 1) * batchsize] - - hidin = np.dot(x_i, w) + b - - hidout = np.tanh(hidin) - - outin = np.dot(hidout, v) + c - outout = (np.tanh(outin) + 1) / 2.0 - - g_outout = outout - y_i - err = 0.5 * np.sum(g_outout) ** 2 - - g_outin = g_outout * outout * (1.0 - outout) - - g_hidout = np.dot(g_outin, v.T) - g_hidin = g_hidout * (1 - hidout ** 2) - - b -= lr * np.sum(g_hidin, axis=0) - c -= lr * np.sum(g_outin, axis=0) - w -= lr * np.dot(x_i.T, g_hidin) - v -= lr * np.dot(hidout.T, g_outin) - - -What's missing? ---------------- - -* Non-lazy evaluation (required by Python) hurts performance - -* NumPy is bound to the CPU - -* NumPy lacks symbolic or automatic differentiation - -Now let's have a look at the same algorithm in Theano, which runs 15 times faster if -you have GPU (I'm skipping some dtype-details which we'll come back to). - -.. 
code-block:: python - - ######################### - # Theano for Training a - # Neural Network on MNIST - ######################### - - import numpy as np - - import theano - import theano.tensor as tensor - - x = np.load('data_x.npy') - y = np.load('data_y.npy') - - # symbol declarations - sx = tensor.matrix() - sy = tensor.matrix() - w = theano.shared(np.random.normal(avg=0, std=.1, - size=(784, 500))) - b = theano.shared(np.zeros(500)) - v = theano.shared(np.zeros((500, 10))) - c = theano.shared(np.zeros(10)) - - # symbolic expression-building - hid = tensor.tanh(tensor.dot(sx, w) + b) - out = tensor.tanh(tensor.dot(hid, v) + c) - err = 0.5 * tensor.sum(out - sy) ** 2 - gw, gb, gv, gc = tensor.grad(err, [w, b, v, c]) - - # compile a fast training function - train = theano.function([sx, sy], err, - updates={ - w: w - lr * gw, - b: b - lr * gb, - v: v - lr * gv, - c: c - lr * gc}) - - # now do the computations - batchsize = 100 - for i in xrange(1000): - x_i = x[i * batchsize: (i + 1) * batchsize] - y_i = y[i * batchsize: (i + 1) * batchsize] - err_i = train(x_i, y_i) - - -Theano in one slide -------------------- - -* High-level domain-specific language tailored to numeric computation - -* Compiles most common expressions to C for CPU and GPU. - -* Limited expressivity means lots of opportunities for expression-level optimizations - - * No function call -> global optimization - - * Strongly typed -> compiles to machine instructions - - * Array oriented -> parallelizable across cores - - * Support for looping and branching in expressions - -* Expression substitution optimizations automatically draw - on many backend technologies for best performance. - - * FFTW, MKL, ATLAS, SciPy, Cython, CUDA - - * Slower fallbacks always available - -* Automatic differentiation and R op - -* Sparse matrices - - -Project status --------------- - -* Mature: theano has been developed and used since January 2008 (5.5 yrs old) - -* Driven over 87 research papers - -* Good user documentation - -* Active mailing list with participants from outside our lab - -* Core technology for a funded Silicon-Valley startup - -* Many contributors (some from outside our lab) - -* Used to teach IFT6266 for many years - -* Used for research at Google and Yahoo. - -* Downloads (January 2011 - June 8 2011): - - * Pypi (16 July 2013): 60k total, 159 last day, 823 last week - - * Github (`bleeding edge` repository): unknown - - - - -Why scripting for GPUs? ------------------------ - -They *Complement each other*: - -* GPUs are everything that scripting/high level languages are not - - * Highly parallel - - * Very architecture-sensitive - - * Built for maximum FP/memory throughput - - * So hard to program that meta-programming is easier. - -* CPU: largely restricted to control - - * Optimized for sequential code and low latency (rather than high throughput) - - * Tasks (1000/sec) - - * Scripting fast enough - -Best of both: scripted CPU invokes JIT-compiled kernels on GPU. - - -How Fast are GPUs? ------------------- - -* Theory - - * Intel Core i7 980 XE (107Gf/s float64) 6 cores - - * NVIDIA C2050 (515 Gf/s float64, 1Tf/s float32) 480 cores - - * NVIDIA GTX580 (1.5Tf/s float32) 512 cores - - * GPUs are faster, cheaper, more power-efficient - -* Practice (our experience) - - * Depends on algorithm and implementation! - - * Reported speed improvements over CPU in lit. vary *widely* (.01x to 1000x) - - * Matrix-matrix multiply speedup: usually about 10-20x. - - * Convolution speedup: usually about 15x. 
- - * Elemwise speedup: slower or up to 100x (depending on operation and layout) - - * Sum: can be faster or slower depending on layout. - -* Benchmarking is delicate work... - - * How to control quality of implementation? - - * How much time was spent optimizing CPU vs GPU code? - - * Theano goes up to 100x faster on GPU because it uses only one CPU core - - * Theano can be linked with multi-core capable BLAS (GEMM and GEMV) - -* If you see speedup > 100x, the benchmark is probably not fair. diff --git a/doc/crei2013/logreg.py b/doc/crei2013/logreg.py deleted file mode 100644 index bd1bbbe6ff8..00000000000 --- a/doc/crei2013/logreg.py +++ /dev/null @@ -1,44 +0,0 @@ -import numpy -import theano -import theano.tensor as tt -rng = numpy.random - -N = 400 -feats = 784 -D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2)) -training_steps = 10000 - -# Declare Theano symbolic variables -x = tt.matrix("x") -y = tt.vector("y") -w = theano.shared(rng.randn(feats), name="w") -b = theano.shared(0., name="b") -print "Initial model:" -print w.get_value(), b.get_value() - -# Construct Theano expression graph -p_1 = 1 / (1 + tt.exp(-tt.dot(x, w) - b)) # Probability that target = 1 -prediction = p_1 > 0.5 # The prediction thresholded -xent = -y * tt.log(p_1) - (1 - y) * tt.log(1 - p_1) # Cross-entropy loss -cost = xent.mean() + 0.01 * (w ** 2).sum() # The cost to minimize -gw, gb = tt.grad(cost, [w, b]) - -# Compile -train = theano.function( - inputs=[x, y], - outputs=[prediction, xent], - updates=[(w, w - 0.1 * gw), - (b, b - 0.1 * gb)], - name='train') - -predict = theano.function(inputs=[x], outputs=prediction, - name='predict') - -# Train -for i in range(training_steps): - pred, err = train(D[0], D[1]) - -print "Final model:" -print w.get_value(), b.get_value() -print "target values for D:", D[1] -print "prediction on D:", predict(D[0]) diff --git a/doc/crei2013/logreg_profile.prof b/doc/crei2013/logreg_profile.prof deleted file mode 100644 index bd61054a32e..00000000000 --- a/doc/crei2013/logreg_profile.prof +++ /dev/null @@ -1,121 +0,0 @@ -Function profiling -================== - Message: train - Time in 10000 calls to Function.__call__: 7.171231e+00s - Time in Function.fn.__call__: 6.686692e+00s (93.243%) - Time in thunks: 6.511275e+00s (90.797%) - Total compile time: 6.550491e-01s - Theano Optimizer time: 5.976810e-01s - Theano validate time: 1.260662e-02s - Theano Linker time (includes C, CUDA code generation/compiling): 2.649593e-02s - -Class ---- -<% time>
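A profile in this format can be produced by enabling Theano's profiler. The sketch below is one way to do it for the ``train`` function from ``logreg.py`` above; it assumes the ``profile`` argument of ``theano.function`` and the ``profile`` Theano flag, and is not necessarily how this particular profile was generated.

.. code-block:: python

    # sketch: recompile `train` from logreg.py with profiling enabled
    train = theano.function(
        inputs=[x, y],
        outputs=[prediction, xent],
        updates=[(w, w - 0.1 * gw),
                 (b, b - 0.1 * gb)],
        name='train',
        profile=True)

    for i in range(training_steps):
        train(D[0], D[1])

    # print the per-Op / per-Class timing tables
    train.profile.summary()

Alternatively, the unmodified script can be run as ``THEANO_FLAGS=profile=True python logreg.py``, which prints the profile when the process exits.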