diff --git a/.gitignore b/.gitignore index 62f99999dd7..5865e241be2 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ *.log *.nav *.out +*.pdf *.snm *.toc *.vrb diff --git a/.mailmap b/.mailmap index c59dab83ce8..bb3acb31ee2 100644 --- a/.mailmap +++ b/.mailmap @@ -7,11 +7,8 @@ # # # # This file is up-to-date if the command git log --format="%aN <%aE>" | sort -u # # gives no duplicates. -Jörg Bornschein Joerg Bornschein -Eric Hunsberger hunse -Jan Schlüter f0k + Rami Al-Rfou' Rami Al-Rfou -Arnaud Bergeron David Warde-Farley David Warde-Farley David Warde-Farley David Warde Farley @@ -20,7 +17,7 @@ Douglas Eck eckdoug@waits.local Dumitru Erhan dumitru@deepnets.mtv.corp.google.com Dumitru Erhan erhandum@bikat.iro.umontreal.ca Francois Savard fsavard -Steven Pigeon steven-pigeon + # 5 Firstname Lastname # 4 Laboratoire d'Informatique des Systemes Adaptatifs # 6 Li Yao @@ -28,11 +25,6 @@ Steven Pigeon steven-pigeon # 2 onze # 25 projects@lgcm # 1 tutorial/debug_faq.txt -Bogdan Budescu bbudescu -Sebastian Berg seberg -Huy Nguyen huyng -Wei Li kuantkid -Ethan Buchman ebuchman Frederic Bastien Frederic Bastien Frederic Bastien Frederic Bastien Frederic Bastien Frédéric Bastien @@ -63,7 +55,6 @@ James Bergstra james@mackie James Bergstra james@x40.unstable James Bergstra test_rng_mrg.py John Salvatier jsalvatier -John Salvatier john salvatier Joseph Turian Joseph Turian Joseph Turian turian@grenat.iro.umontreal.ca Joseph Turian turian@lgcm @@ -92,4 +83,3 @@ Sander Dieleman benanne Xavier Glorot glorotxa Xavier Glorot glorotxa@timide.iro.umontreal.ca Yoshua Bengio bengioy@bengio-mac.local -Sina Honari SinaHonari diff --git a/.travis.yml b/.travis.yml index b1f1aa0b29f..3698be65683 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,49 +3,42 @@ language: python python: - - "2.6" + - "2.5" # - "2.7" # - "3.2" - # command to install dependencies before_install: - - sudo apt-get install -q libatlas3gf-base libatlas-dev liblapack-dev gfortran -# Install miniconda to avoid compiling scipy - - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh - - chmod +x miniconda.sh - - ./miniconda.sh -b - - export PATH=/home/travis/miniconda/bin:$PATH - - conda update --yes conda - + - sudo apt-get install -qq libatlas3gf-base libatlas-dev liblapack-dev gfortran +# - sudo apt-get install -qq libopenblas-dev install: -# We support scipy 0.7.2, but it is not available on conda. -# So we test with 0.11. Our internal buildbot have 0.7.2. - - conda create --yes -q -n py26 python=2.6 numpy=1.6 scipy=0.11 nose=1.1 pip - - source activate py26 - - pip install pydot - - pip install . --no-deps --use-mirrors - +# If we don't install numpy before SciPy 0.10.1, the SciPy installations fails. + - "pip install -q numpy==1.5 --use-mirrors" +# We support scipy 0.7.2, but it is not available on pypi anymore. +# So we test with 0.8. Our internal buildbot have 0.7.2. +# We install it later only for the PART that need it. +# - "pip install -q scipy==0.8 --use-mirrors" + - "pip install . --no-deps --use-mirrors" # command to run tests env: - - PART="theano/scan_module/" - - PART="theano/sandbox theano/sparse theano/scalar/ theano/tensor/nnet/" - - PART="theano/tensor/tests/test_basic.py theano/tensor/signal/ theano/compile/ theano/gof/ theano/misc/ theano/tests/ theano/compat" + - PART="theano/tensor/nnet/ theano/tensor/signal/ theano/compile/ theano/gof/ theano/misc/ theano/tests/ theano/compat theano/scan_module/" +# This part is select such that all scipy code is there. 
+# We install scipy only for this part to make the test time faster. + - PART="theano/sandbox theano/sparse theano/scalar/" + - PART="theano/tensor/tests/test_basic.py" - PART="-e test_basic.py theano/tensor/tests" - script: - - export THEANO_FLAGS=blas.ldflags="-lblas -lgfortran",warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise + - "if [ `expr \"$PART\" : '.*sparse'` -gt \"0\" ]; then pip install scipy==0.8 --use-mirrors; fi" + - export THEANO_FLAGS=warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise - python --version - uname -a - free -m - df -h - ulimit -a - - echo "$PART" - - theano-nose -v $PART - - theano-cache list + - echo $PART + - theano-nose $PART + #after_script: after_failure: - cat /home/travis/.pip/pip.log #after_success: - -cache: apt diff --git a/MANIFEST.in b/MANIFEST.in index 458a924bd22..cabc8eaeacb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,10 +1,7 @@ global-include *.txt -global-include *.c global-include *.cu global-include *.cuh -global-include *.h global-include *.sh -global-include *.pkl recursive-include docs include bin/theano-cache include bin/theano-nose diff --git a/NEWS.txt b/NEWS.txt index edba00faf1c..977366d3873 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,391 +1,9 @@ +.. _NEWS: + ============= Release Notes ============= -Theano 0.6 (December 3th, 2013) -=================================== - -We recommend that everybody update to this version. - - -Highlights (since 0.6rc5): - * Last release with support for Python 2.4 and 2.5. - * We will try to release more frequently. - * Fix crash/installation problems. - * Use less memory for conv3d2d. - -0.6rc4 skipped for a technical reason. - -Highlights (since 0.6rc3): - * Python 3.3 compatibility with buildbot test for it. - * Full advanced indexing support. - * Better Windows 64 bit support. - * New profiler. - * Better error messages that help debugging. - * Better support for newer NumPy versions (remove useless warning/crash). - * Faster optimization/compilation for big graph. - * Move in Theano the Conv3d2d implementation. - * Better SymPy/Theano bridge: Make an Theano op from SymPy expression and use SymPy c code generator. - * Bug fixes. - -Change from 0.6rc5: - * Fix crash when specifing march in cxxflags Theano flag. (Frederic B., reported by FiReTiTi) - * code cleanup (Jorg Bornschein) - * Fix Canopy installation on windows when it was installed for all users: Raingo - * Fix Theano tests due to a scipy change. (Frederic B.) - * Work around bug introduced in scipy dev 0.14. (Frederic B.) - * Fix Theano tests following bugfix in SciPy. (Frederic B., reported by Ziyuan Lin) - * Add Theano flag cublas.lib (Misha Denil) - * Make conv3d2d work more inplace (so less memory usage) (Frederic B., repoted by Jean-Philippe Ouellet) - - -Committers since 0.5: - -Frederic Bastien -Pascal Lamblin -Ian Goodfellow -Olivier Delalleau -Razvan Pascanu -abalkin -Arnaud Bergeron -Nicolas Bouchard + -Jeremiah Lowin + -Matthew Rocklin -Eric Larsen + -James Bergstra -David Warde-Farley -John Salvatier + -Vivek Kulkarni + -Yann N. 
Dauphin -Ludwig Schmidt-Hackenberg + -Gabe Schwartz + -Rami Al-Rfou' + -Guillaume Desjardins -Caglar + -Sigurd Spieckermann + -Steven Pigeon + -Bogdan Budescu + -Jey Kottalam + -Mehdi Mirza + -Alexander Belopolsky + -Ethan Buchman + -Jason Yosinski -Nicolas Pinto + -Sina Honari + -Ben McCann + -Graham Taylor -Hani Almousli -Ilya Dyachenko + -Jan Schlüter + -Jorg Bornschein + -Micky Latowicki + -Yaroslav Halchenko + -Eric Hunsberger + -Amir Elaguizy + -Hannes Schulz + -Huy Nguyen + -Ilan Schnell + -Li Yao -Misha Denil + -Robert Kern + -Sebastian Berg + -Vincent Dumoulin + -Wei Li + -XterNalz + - - -A total of 51 people contributed to this release. -People with a "+" by their names contributed a patch for the first time. - - -Theano 0.6rc5 (November 25th, 2013) -=================================== - -We recommend that everybody update to this version. - -We plan to release 0.6 in one week if there is no problem introduced -with this release candidate. - -Theano 0.6rc4 was skipped due to a problem with pypi - -Highlights: - * Python 3.3 compatibility with buildbot test for it. - * Full advanced indexing support. - * Better Windows 64 bit support. - * New profiler. - * Better error messages that help debugging. - * Better support for newer NumPy versions (remove useless warning/crash). - * Faster optimization/compilation for big graph. - * Move in Theano the Conv3d2d implementation. - * Better SymPy/Theano bridge: Make an Theano op from SymPy expression and use SymPy c code generator. - * Bug fixes. - -Committers for this rc5 only: - -Frederic Bastien -Pascal Lamblin -Arnaud Bergeron -abalkin -Olivier Delalleau -John Salvatier -Razvan Pascanu -Jeremiah Lowin -Ludwig Schmidt-Hackenberg + -Vivek Kulkarni -Matthew Rocklin -Gabe Schwartz -James Bergstra -Sigurd Spieckermann + -Bogdan Budescu + -Mehdi Mirza + -Nicolas Bouchard -Ethan Buchman + -Guillaume Desjardins -Ian Goodfellow -Jason Yosinski -Sina Honari + -Ben McCann + -David Warde-Farley -Ilya Dyachenko + -Jan Schluter + -Micky Latowicki + -Yaroslav Halchenko + -Alexander Belopolsky -Hannes Schulz + -Huy Nguyen + -Robert Kern + -Sebastian Berg + -Vincent Dumoulin + -Wei Li + -XterNalz + - - -A total of 36 people contributed to this release. -People with a "+" by their names contributed a patch for the first time. - -Installation: - * Canopy support (direct link to MKL): - * On Linux and Mac OSX (Frederic B., Robert Kern) - * On Windows (Edward Shi, Frederic B.) - - * Anaconda instructions (Pascal L., Frederic B.) - * Doc Ubuntu 13.04 (Frederic B.) - * Better support of newer NumPy version(remove useless warning/crash) (Frederic B., Huy Nguyen) - -Bug fixes: - * Scan: if a scan node was cloned (by theano.clone) with different inputs, and if both the initial and the cloned nodes are used in the function being compiled, the value of the outputs of one would be replaced with the outputs of the other one. (Pascal L.) - * Sparse: Disable the optimization that introduce the CSMGradC op as it doesn't work correctly with unsorted indices. (Frederic B.) - * Mac: Fix wrong result of GpuDownsampleFactorMaxGrad on Mac OSX. (Pascal L.) - * Mac: Auto-Detect and work around a bug in BLAS on MacOS X (Pascal L.) - * Mac: Work around bug in MacOS X. If 2 compiled modules had the same name, the OS or Python was not always the right one even when we used the right handle to it. (Pascal L.) - Use this hash in the Python module, and in %(nodename)s, so that different helper functions in the support code for different Ops will always have different names. 
- * Sparse grad: Fix ConstructSparseFromList.infer_shape (Pascal L., reported by Rami Al-Rfou') - * (introduced in the development version after 0.6rc3 release) (Frederic B.) - Reduction that upcasts the input on no axis (ex: call theano.sum() on a scalar when the original dtype isn't float64 or - [u]int64). It produced bad results as we did not upcasted the inputs in the code, we just copy them. - * Fix some cases of theano.clone() when we get a replacement of x that is a function of x. (Razvan P., reported by Akio Takano) - * Fix grad of Alloc when we unbroadcast the value and it isn't a scalar. (Frederic B., reported Ian G.) - - * In some cases (I think most cases), there was an exception raised in the theano.tensor.grad() method. - But in theory, there could be bad shapes produced in the unbroadcasted dimensions. - -Interface Deprecation (a warning is printed): - * The mode ProfileMode is now deprecated, use the Theano flag profile=True to replace it. - * New theano.sparse_grad() interface to get the sparse grad of a_tensor[an_int_vector]. (Frederic B.) - This can speed up the sparse computations when a small fraction of a_tensor is taken. - Deprecate the old interface for this. (Frederic B.) - -Interface Changes: - * Interface change subtensor and take are not in tensor.basic anymore. They were available from tensor.* and are still available from there. (Frederic B., Matthew Rocklin) - * This lowers the basic.py size to 191k, so under 200k for github search. - * Add -m32 or -m64 in the module cache key and add the python bitwidth in the compiledir path. (Pascal L.) - * mrg.normal now has the parameter size mandatory. It was crashing with the default value of None. (Olivier D.) - * Remove the deprecated passing of multiple modes to theano function. (Frederic B.) - * Change FunctionGraph Features interface of the {on_prune(),on_import()} call back to take a reason. (Frederic B.) - * FunctionGraph now clone the input graph by default. (Frederic B.) - * Added a parameter to optionally not do this cloning. - * This was needed to speed up compilation - -New Interface (reuses existing functionality): - * Add hostname as a var in compiledir_format (Frederic B.) - * Add a new Theano flag: compute_test_value_opt. It takes the same values as compute_test_value. It enables compute_test_value during Theano optimization. Only useful to debug Theano optimization. Also small changes to some optimization to work correctly in that setup. (Frederic B.) - * Add the value pdb to the Theano flag: compute_test_value and compute_test_value_opt. (Frederic B.) - * Add the Theano flag: optimizer_verbose. Default False. When True, we print all the optimization being applied.(Frederic B.) - * Add Op.c_init_code() to allow running the code when the c cmodule is imported (Pascal L.) - * Allow theano.tensor.ones(3) to support scalar and not just list of scalar as numpy.ones (Jeremiah Lowin) - * Make the memory profiler print the FLOPS used for the ops that know how to compute it. (Frederic B.) - -New Features: - * Make tensor.{constant,as_tensor_variable} work with memmap. (Christian Hudon, Frederic Bastien) - * compilation work on ARM processor (Raspberry Pi, Vincent Dumoulin) - * Add numpy.random.choice wrapper to our random number generator (Sigurd Spieckermann) - * Better SymPy/Theano bridge: Make an Theano op from SymPy expression and use SymPy c code generator (Matthew Rocklin) - * Move in Theano the Conv3d2d implementation (James Bergstra, Frederic B., Pascal L.) 
- * First version of the new GPU back-end available (Arnaud Bergeron, Frederic B.) - - * Not all Ops have been converted to this new back-end. - To use, use Theano flag device=cudaN or device=openclN, where N is a integer. - * Python 3.3 compatible (abalkin, Gabe Schwartz, Frederic B., Pascal L.) - * A new profiler (Frederic B.) - The new profiler now can profile the memory with the Theano flag profile_memory=True. - The ProfileMode now can't profile memory anymore and prints a message about it. - Now we raise an error if we try to profile when the gpu is enabled if we didn't set - correctly the env variable to force the driver to sync the kernel launch. - Otherwise the profile information are useless. - The new profiler supports the enabling/disabling of the garbage collection. - * Adds tensor.tri, tensor.triu, and tensor.tril functions that wrap Numpy equivalents (Jeremiah Lowin) - * Adds tensor.nonzero, tensor.flatnonzero functions that wrap Numpy equivalents (Jeremiah Lowin) - * Adds tensor.nonzero_values to get around lack of advanced indexing for nonzero elements (Jeremiah Lowin) - * Make {inc,set}_subtensor work on output of take. (Pascal L.) - * When device=cpu and force_device=True, force that we disable the gpu. (Frederic B.) - * Better Windows 64 bit support for indexing/reshaping (Pascal L.) - * Full advanced indexing support (John Salvatier, seberg) - * Add theano.tensor.stacklist(). Recursivly stack lists of tensors to maintain similar structure (Matthew R.) - * Add Theano flag value: on_opt_error=pdb (Olivier D.) - * GpuSoftmax[WithBias] for bigger row. (Frederic B.) - * Make Erfinv work on the GPU (Guillaume Desjardin, Pascal L.) - * Add "theano-cache basecompiledir purge" (Pascal L.) - This purges all the compiledirs that are in the base compiledir. - * A_tensor_variable.zeros_like() now supports the dtype parameter (Pascal L.) - * More stable reduce operations by default (Pascal L.) - Add an accumulator dtype to CAReduceDtype (acc_dtype) - by default, acc_dtype is float64 for float32 inputs, - then cast to specified output dtype (float32 for float32 inputs) - * Test default blas flag before using it (Pascal L.) - This makes it work correctly by default if no blas library is installed. - * Add cuda.unuse() to help tests that need to enable/disable the GPU (Frederic B.) - * Add theano.tensor.nnet.ultra_fast_sigmoid and the opt (disabled by default) local_ultra_fast_sigmoid. (Frederic B.) - * Add theano.tensor.nnet.hard_sigmoid and the opt (disabled by default) local_hard_sigmoid. (Frederic B.) - * Add class theano.compat.python2x.Counter() (Mehdi Mirza) - * Allow a_cuda_ndarray += another_cuda_ndarray for 6d tensor. (Frederic B.) - * Make the op ExtractDiag work on the GPU. (Frederic B.) - * New op theano.tensor.chi2sf (Ethan Buchman) - * Lift Flatten/Reshape toward input on unary elemwise. (Frederic B.) - This makes the "log(1-sigmoid) -> softplus" stability optimization being applied with a flatten/reshape in the middle. - * Make MonitorMode use the default optimizers config and allow it to change used optimizers (Frederic B.) - * Add support for ScalarOp.c_support_code in GpuElemwise. (Frederic B.) - * Also make the Psi function run on GPU. (Frederic B.) - * Make tensor.outer(x,y) work when ndim != 1 as numpy.outer. - * Kron op: Speed up/generalize/GPU friendly. (Frederic B.) - (It is not an op anymore, but reuses current op) - * Add gpu max for pattern (0, 1) and added all gpu max pattern for gpu min. (Frederic B.) - * Add GpuEye (Frederic B.) 
- * Make GpuCrossentropySoftmaxArgmax1HotWithBias and GpuCrossentropySoftmax1HotWithBiasDx work for bigger inputs (Frederic B., reported by Ryan Price) - * Finish and move out of sandbox theano.sparse.basic.true_dot (Nicolas Bouchard, Frederic B.) - And document all sparse dot variants. - * Implement the mode ignore_borders for GpuImages2Neibs (Frederic B.) - * Make many reduction functions accept a numpy scalar as axis (Jeremiah Lowin) - * Allow numpy.asarray(cuda_ndarray, dtype=...) (Frederic B.) - * theano-cache cleanup now remove cached module old version of code. (Frederic B.) - - -Speed-ups: - * Optimizer speed up. (Frederic B.) - * Fix warning on newer llvm version on Mac. (Pascal L., reported by Jeremiah Lowin and Chris Fonnesbeck) - * Allow pickling of more Ops to allow reusing the compiled code (Pascal L., Frederic B.) - * Optimize more cases of dot22 and scalar when we can't make a gemm (Pascal L., Frederic B.) - * Speed up GpuJoin with c code (Ludwig Schmidt-Hackenberg, Frederic B.) - * Faster GpuAdvancedIncSubtensor1 on Fermi GPU (and up) on matrix. (Vivek Kulkarni) - * Faster GPUAdvancedIncSubtensor1 in some cases on all GPU (Vivek Kulkarni) - * Implemented c_code for AdvancedSubtensor1 (abalkin) - * Add the equivalent of -march=native to g++ command line. (Frederic B., Pascal L.) - * Speed up compilation with Scan (Jan Schluter) - * Merge more Scan nodes together (Pascal L., Yao Li). - * Add MakeVector.c_code (Frederic B.) - * Add Shape.c_code (Frederic B.) - * Optimize Elemwise when all the inputs are fortran (Frederic B.) - We now generate a fortran output and use vectorisable code. - * Add ScalarOp.c_code_contiguous interface and do a default version. (Frederic B.) - This could optimize elemwise by helping the compiler generate SIMD instruction. - * Use ScalarOp.c_code_contiguous with amdlibm. (Frederic B.) - This speeds up exp, pow, sin, cos, log, log2, log10 and sigmoid when the input is contiguous in memory. - * A fix that removes a local_setsubtensor_of_allocs optimization warning and enables it in that case. (Frederic B., reported by John Salvatier) - * Make inv_as_solve optimization work (Matthew Rocklin) - -Crash/no return fixes: - * Fix scan crash in the grad of grad of a scan with special structure (including scan in a scan) (Razvan P., Bitton Tenessi) - * Fix various crashes when calling scan() with inputs specified in unusual ways. (Pascal L.) - * Fix shape crash inserted by Scan optimization. The gradient of some recursive scan was making the PushOutSeqScan optimization insert crash during the execution of a Theano function. (Frederic B., reported by Hugo Larochelle) - * Fix command not returning with recent mingw64 on Windows (Pascal L., reported by many people) - * Fix infinite loop related to Scan on the GPU. (Pascal L.) - * Fix infinite loop when the compiledir is full. (Frederic B.) - * Fix a shape cycle crash in the optimizer (Pascal L., Frederic B., reported by Cho KyungHyun) - * Fix MRG normal() now allow it to generate scalars. (Pascal L.) - * Fix some GPU compilation issue on Mac (John Yani, Frederic B.) - * Fix crash when building symbolic random variables with a mix of symbolic and numeric scalar in the "size" parameter. (Pascal L., Reported by Wu Zhen Zhou) - * Make some Op.grad() implementions not return None (Pascal L.) - * Crash fix in the grad of elemwise about an DisconnectedType (Pascal L, reported by Thomas Wiecki) - * Fix local_gpu_multinomial optimization handling of broadcast information. 
(Frederic B., reported by Caglar) - * Fix crash with change introduced in NumPy 1.7.1 (Pascal L., reported by Thomas Wiecki) - * Compilation failure with complex (Pascal L., reported by autumncat) - * Gpu reduction on all dimensions of a 4d tensor. (Frederic B., reported by Arjun Jain) - * Fix crash for a combination of grad of dot and dimshuffle when only one of the inputs for a corresponding dimensions was knowing that it was broadcastable. (Frederic B., reported by Micky Latowicki) - * AdvancedSubtensor1: allow broadcasted index vector. (Frederic B., reported by Jeremiah Lowin) - * Fix compute_test_value for ifelse (Olivier D., reported by Bitton Tenessi) - * Fix import error with some versions of NumPy (Olivier D.) - * Fix Scan grad exception (Razvan P., reported by Nicolas BL) - * Fix compute_test_value for a non_sequence when calling the gradient of Scan (Pascal L., reported by Bitton Tenessi). - * Crash fix in Scan following interface change in 0.6rc2 (Razvan P.) - * Crash fix on Scan (Razvan P.) - * Crash fix on Scan (Pascal L., reported by Sina Honari and Sigurd) - * Fix crash in Scan gradient related to compute_test_value (Frederic B., reported by Bitton Tenessi) - * Fix a scan optimization warning/error depending of Theano flags (Frederic B.) - * Fixed crash for unimplemented elemwise gradient (Olivier D., reported by Michael McNeil Forbes) - * Fix crash in the elemwise python code for some big shape with power of 2. (Sina Honari, Pascal L.) - * Fix compile and import errors on Windows including for the GPU. (Bogdan Budescu) - * Fix GPU compilation on Windows (XterNalz) - * Fix local_abs_merge optimization crash (Pascal L., reported by Jeremiah Lowin) - * Fix import theano crash when g++ isn't there (Olivier D.) - * Fix crash related to rebuild of Theano graph (Pascal L., reported by Divine Eguzouwa) - * Fix crash during compilation (David Ward-Farley) - * Crash fix in the grad of GPU op in corner case (Pascal L.) - * Crash fix on MacOS X (Robert Kern) - * theano.misc.gnumpy_utils.garray_to_cudandarray() set strides correctly for dimensions of 1. (Frederic B., reported by Justin Bayer) - * Fix crash during optimization with consecutive sums and some combination of axis (Frederic B., reported by Caglar Gulcehre) - * Fix crash with keepdims and negative axis (Frederic B., reported by David W.-F.) - * Fix crash of theano.[sparse.]dot(x,y) when x or y is a vector. (Frederic B., reported by Zsolt Bitvai) - * Fix opt crash/disabled with ifelse on the gpu (Frederic B, reported by Ryan Price) - * Fix crash in optimization involving dot22, (Pascal L., reported by @micklat) - * Prevent shape optimizations from introducing cycles in the graph (Frederic Bastien, Pascal Lamblin, reported by Kyunghyun Cho) - -Others: - * Update/Fixes/Typo/pep8 documentation and/or tutorial (Olivier D., David W.-F., Frederic B., Yaroslav Halchenko, Micky Latowicki, Ben McCann, Jason Yosinski, reported by Arnaud Bergeron) - * Doc how to make a sparse Op. (Frederic B.) - * Doc compatibility guide (abalkin) - * Fix problem in remove_constants_and_unused_inputs_scan. (useless warning and maybe slow down) (Pascal L.) - * Fix rop dot.(Razvan P., reported by Jeremiah Lowin) - * Raise better error related to pydot bug. (Frederic B., reported by Jason Yosinski and Ludwig Schmidt-Hackenberg) - * Fix to Theano tutorial examples. 
(reported by Ilya Dyachenko) - * Fix SharedVar.value property to make it raise an exception (Frederic B., reported by Drew Duncan) - * Fix verification with compute_test_value in grad() (Frederic B.) - * Theano flags are now evaluated lazily, only if requested (Frederic B.) - * Fix test when g++ is not avail (Frederic B.) - * Add manual instructions for OpenBLAS on Ubuntu by (Jianri Li ) - * Better/more error messages (Frederic B., Pascal L., Ian Goodfellow) - * Fix Error reporting with GpuConv (Frederic B., reported by Heng Luo and Nicolas Pinto) - * Now travis-ci tests with scipy the parts that need it (Frederic B.) - * Export some functions that work on CudaNdarray for windows (Frederic B.) - * If the user specifies a -arch=sm_* value in the Theano flags for the gpu, don't add one (Frederic B., Pascal L.) - * If a C thunk returns an error, check if a python exception is set. Otherwise, set a default one (Pascal L.) - * Crash fix introduced in the development version (Wei LI) - * Added BLAS benchmark result (Frederic B., Ben McCann) - * Fix code comment (Hannes Schulz) - * More stable tests (Frederic B.) - * Add utt.asset_allclose(a, b) to have better error message. (Frederic B.) - * Better error message with compute_test_value (Frederic, reported by John Salvatier) - * Stochastic order behavior fix (Frederic B.) - - * Simpler initial graph for subtensor infer shape (Olivier D.) - The optimization was doing the optimization, but this allows better reading of the graph before optimization. - * Better detection of non-aligned ndarray (Frederic B.) - * Update MRG multinomial gradient to the new interface (Mehdi Mirza) - * Implement Image2Neibs.perform() to help debug (Frederic B.) - * Remove some Theano flags from the compilation key (Frederic B.) - * Make theano-nose work on executable '\*.py' files. (Alistair Muldal) - * Make theano-nose work with older nose version (Frederic B.) - * Add extra debug info in verify_grad() (Frederic B.) - - Theano 0.6rc3 (February 14th, 2013) =================================== @@ -524,6 +142,9 @@ Others: * Documentation improvements. (Many people including David W-F, abalkin, Amir Elaguizy, Olivier D., Frederic B.) * The current GPU back-end have a new function CudaNdarray_prep_output(CudaNdarray ** arr, int nd, const int * dims) (Ian G) +============= +Release Notes +============= Theano 0.6rc2 (November 21th, 2012) =================================== @@ -636,6 +257,9 @@ Crash Fixes: Other: * Doc typo fixes, Doc updates, Better error messages: Olivier D., David W.F., Frederic B., James B., Matthew Rocklin, Ian G., abalkin. +============= +Release Notes +============= Theano 0.6rc1 (October 1st, 2012) ================================= @@ -857,7 +481,6 @@ Speed up: Speed up GPU: * Convolution on the GPU now checks the generation of the card to make it faster in some cases (especially medium/big ouput image) (Frederic B.) - * We had hardcoded 512 as the maximum number of threads per block. Newer cards support up to 1024 threads per block. * Faster GpuAdvancedSubtensor1, GpuSubtensor, GpuAlloc (Frederic B.) diff --git a/NEWS_DEV.txt b/NEWS_DEV.txt deleted file mode 100644 index 0709d7ea649..00000000000 --- a/NEWS_DEV.txt +++ /dev/null @@ -1,53 +0,0 @@ -.. _NEWS: - -=================== -DRAFT Release Notes -=================== - -git log -p rel-0.6rc3... |grep Merge|grep '#' |cut -f 8 -d ' ' | replace "#" "* https://github.com/Theano/Theano/pull/" - -git shortlog -sn rel-0.6rc3.. 
- -Done up to PR 1608 -* https://github.com/Theano/Theano/pull/1608 - -* https://github.com/Theano/Theano/pull/1591 # need info - -Interface change: - - theano.tensor.signal.conv2d(2d,2d) output 2d answer. (Frederic B., reported by Alexander Izvorski) - - -Theano Development version -========================== - -NEWS.txt: - -We recommend that everybody update to this version. - -Highlights: - - -Committers for this dev version only: - - -A total of X people contributed to this release. -People with a "+" by their names contributed a patch for the first time. - -Installation: - -Bug fixes: - -Interface Deprecation (a warning is printed): - -Interface Changes: - -New Interface (reuses existing functionality): - -Speed-ups: - -Crash/no return fixes: - -Others: - -Todo for the final release: - * update the NEWS.txt file. diff --git a/bin/theano-cache b/bin/theano-cache index 142f53d9881..f5f74ff9842 100755 --- a/bin/theano-cache +++ b/bin/theano-cache @@ -8,6 +8,7 @@ from theano import config from theano.gof.cc import get_module_cache _logger = logging.getLogger('theano.bin.theano-cache') +_logger.setLevel(logging.WARN) def print_help(exit_status): @@ -18,7 +19,7 @@ def print_help(exit_status): print 'Type "theano-cache clear" to erase the cache' print 'Type "theano-cache list" to print the cache content' print 'Type "theano-cache unlock" to unlock the cache directory' - print 'Type "theano-cache cleanup" to delete keys in the old format/code version' + print 'Type "theano-cache cleanup" to delete keys in the old format' print 'Type "theano-cache purge" to force deletion of the cache directory' print ('Type "theano-cache basecompiledir" ' 'to print the parent of the cache directory') @@ -59,8 +60,6 @@ elif len(sys.argv) == 2: theano.gof.compiledir.print_compiledir_content() elif sys.argv[1] == 'cleanup': theano.gof.compiledir.cleanup() - cache = get_module_cache(init_args=dict(do_refresh=False)) - cache.clear_old() elif sys.argv[1] == 'unlock': theano.gof.compilelock.force_unlock() print 'Lock successfully removed!' diff --git a/bin/theano-nose b/bin/theano-nose index db4b5118093..37cf4ee75e1 100755 --- a/bin/theano-nose +++ b/bin/theano-nose @@ -18,6 +18,7 @@ disable that plugin. import logging _logger = logging.getLogger('theano.bin.theano-nose') +_logger.setLevel(logging.WARN) import os import nose diff --git a/doc/acknowledgement.txt b/doc/acknowledgement.txt index 412e5fbeeb1..427b3962e98 100644 --- a/doc/acknowledgement.txt +++ b/doc/acknowledgement.txt @@ -19,8 +19,3 @@ Acknowledgements `theano/misc/cpucount.py` come from the project `pyprocessing `_. It is available under the same license as Theano. -* Our random number generator implementation on CPU and GPU uses the MRG31k3p algorithm that is described in: - - P. L'Ecuyer and R. Touzin, `Fast Combined Multiple Recursive Generators with Multipliers of the form a = +/- 2^d +/- 2^e `_, Proceedings of the 2000 Winter Simulation Conference, Dec. 2000, 683--689. - - We were authorized by Pierre L'Ecuyer to copy/modify his Java implementation in the `SSJ `_ software and to relicense it under BSD 3-Clauses in Theano. diff --git a/doc/cifarSC2011/advanced_theano.txt b/doc/cifarSC2011/advanced_theano.txt index 6a2fe1019ff..b37dcca132f 100644 --- a/doc/cifarSC2011/advanced_theano.txt +++ b/doc/cifarSC2011/advanced_theano.txt @@ -16,55 +16,46 @@ Conditions **IfElse Example: Comparison with Switch** -.. 
testcode:: - - from theano import tensor as T - from theano.ifelse import ifelse - import theano, time, numpy - - a,b = T.scalars('a','b') - x,y = T.matrices('x','y') - - z_switch = T.switch(T.lt(a,b), T.mean(x), T.mean(y)) - z_lazy = ifelse(T.lt(a,b), T.mean(x), T.mean(y)) +.. code-block:: python - f_switch = theano.function([a,b,x,y], z_switch, - mode=theano.Mode(linker='vm')) - f_lazyifelse = theano.function([a,b,x,y], z_lazy, - mode=theano.Mode(linker='vm')) + from theano import tensor as T + from theano.ifelse import ifelse + import theano, time, numpy - val1 = 0. - val2 = 1. - big_mat1 = numpy.ones((10000,1000)) - big_mat2 = numpy.ones((10000,1000)) + a,b = T.scalars('a','b') + x,y = T.matrices('x','y') + + z_switch = T.switch(T.lt(a,b), T.mean(x), T.mean(y)) + z_lazy = ifelse(T.lt(a,b), T.mean(x), T.mean(y)) - n_times = 10 + f_switch = theano.function([a,b,x,y], z_switch, + mode=theano.Mode(linker='vm')) + f_lazyifelse = theano.function([a,b,x,y], z_lazy, + mode=theano.Mode(linker='vm')) - tic = time.clock() - for i in xrange(n_times): - f_switch(val1, val2, big_mat1, big_mat2) - print 'time spent evaluating both values %f sec'%(time.clock()-tic) + val1 = 0. + val2 = 1. + big_mat1 = numpy.ones((10000,1000)) + big_mat2 = numpy.ones((10000,1000)) - tic = time.clock() - for i in xrange(n_times): - f_lazyifelse(val1, val2, big_mat1, big_mat2) - print 'time spent evaluating one value %f sec'%(time.clock()-tic) + n_times = 10 -.. testoutput:: - :hide: - :options: +ELLIPSIS + tic = time.clock() + for i in xrange(n_times): + f_switch(val1, val2, big_mat1, big_mat2) + print 'time spent evaluating both values %f sec'%(time.clock()-tic) - time spent evaluating both values ... sec - time spent evaluating one value ... sec + tic = time.clock() + for i in xrange(n_times): + f_lazyifelse(val1, val2, big_mat1, big_mat2) + print 'time spent evaluating one value %f sec'%(time.clock()-tic) IfElse Op spend less time (about an half) than Switch since it computes only one variable instead of both. -.. code-block:: none - - $ python ifelse_switch.py - time spent evaluating both values 0.6700 sec - time spent evaluating one value 0.3500 sec +>>> python ifelse_switch.py +time spent evaluating both values 0.6700 sec +time spent evaluating one value 0.3500 sec Note that IfElse condition is a boolean while Switch condition is a tensor, so Switch is more general. @@ -121,7 +112,7 @@ Loops **Scan Example: Calculating a Polynomial** -.. testcode:: +.. code-block:: python import theano import theano.tensor as T @@ -142,10 +133,7 @@ Loops test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32) print calculate_polynomial(test_coeff, 3) - -.. 
testoutput:: - - 19.0 + # 19.0 @@ -279,7 +267,7 @@ Printing/Drawing Theano graphs ``theano.printing.pprint(variable)`` ->>> theano.printing.pprint(prediction) # doctest: +SKIP +>>> theano.printing.pprint(prediction) gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))),TensorConstant{0.5}) @@ -287,7 +275,7 @@ gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))),TensorC ``theano.printing.debugprint({fct, variable, list of variables})`` ->>> theano.printing.debugprint(prediction) # doctest: +SKIP +>>> theano.printing.debugprint(prediction) Elemwise{gt,no_inplace} [@181772236] '' |Elemwise{true_div,no_inplace} [@181746668] '' | |InplaceDimShuffle{x} [@181746412] '' @@ -305,7 +293,7 @@ Elemwise{gt,no_inplace} [@181772236] '' | | | | | |b [@181730156] |InplaceDimShuffle{x} [@181771788] '' | |TensorConstant{0.5} [@181771148] ->>> theano.printing.debugprint(predict) # doctest: +SKIP +>>> theano.printing.debugprint(predict) Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2 |dot [@183018796] '' 1 | |x [@183000780] @@ -316,19 +304,19 @@ Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2 - Picture Printing of Graphs ->>> theano.printing.pydotprint_variables(prediction) # doctest: +SKIP +>>> theano.printing.pydotprint_variables(prediction) .. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_prediction.png :width: 800 px All pydotprint* requires graphviz and pydot ->>> theano.printing.pydotprint(predict) # doctest: +SKIP +>>> theano.printing.pydotprint(predict) .. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_predic.png :width: 800 px ->>> theano.printing.pydotprint(train) # This is a small train example! # doctest: +SKIP +>>> theano.printing.pydotprint(train) # This is a small train example! .. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_train.png :width: 1500 px diff --git a/doc/cifarSC2011/boot_camp_overview.txt b/doc/cifarSC2011/boot_camp_overview.txt index 861bd1164f4..322b8fcef7d 100644 --- a/doc/cifarSC2011/boot_camp_overview.txt +++ b/doc/cifarSC2011/boot_camp_overview.txt @@ -15,7 +15,7 @@ Day 1 * Show of hands - what is your background? -* Python & NumPy in a nutshell +* Python & Numpy in a nutshell * Theano basics diff --git a/doc/cifarSC2011/gpundarray.txt b/doc/cifarSC2011/gpundarray.txt index 05a05abaeac..0babc8ea6c6 100644 --- a/doc/cifarSC2011/gpundarray.txt +++ b/doc/cifarSC2011/gpundarray.txt @@ -1,5 +1,5 @@ -.. _cifar2013_gpundarray: +.. _gpundarray: ********** GpuNdArray diff --git a/doc/cifarSC2011/index.txt b/doc/cifarSC2011/index.txt index 92debb1e54d..3c1c043c015 100644 --- a/doc/cifarSC2011/index.txt +++ b/doc/cifarSC2011/index.txt @@ -18,7 +18,7 @@ What does it do? * symbolic differentiation. -It complements the Python numeric/scientific software stack (e.g. NumPy, SciPy, +It complements the Python numeric/scientific software stack (e.g. numpy, scipy, scikits, matplotlib, PIL.) Design and feature set has been driven by machine learning research diff --git a/doc/cifarSC2011/introduction.txt b/doc/cifarSC2011/introduction.txt index 836568812fd..06e1ac06480 100644 --- a/doc/cifarSC2011/introduction.txt +++ b/doc/cifarSC2011/introduction.txt @@ -13,7 +13,7 @@ Background Questionaire * What did you do with it? -* Who has used Python? NumPy? SciPy? matplotlib? +* Who has used Python? numpy? scipy? matplotlib? * Who has used iPython? @@ -72,14 +72,14 @@ Python in one slide # PYTHON SYNTAX EXAMPLE ####################### a = 1 # no type declaration required! 
- b = (1, 2, 3) # tuple of three int literals - c = [1, 2, 3] # list of three int literals + b = (1,2,3) # tuple of three int literals + c = [1,2,3] # list of three int literals d = {'a': 5, b: None} # dictionary of two elements # N.B. string literal, None print d['a'] # square brackets index # -> 5 - print d[(1, 2, 3)] # new tuple == b, retrieves None + print d[(1,2,3)] # new tuple == b, retrieves None # -> None print d[6] # raises KeyError Exception @@ -116,18 +116,18 @@ Python in one slide print Bar(99).hello() # Creating an instance of Bar # -> 99 -NumPy in one slide +Numpy in one slide ------------------ * Python floats are full-fledged objects on the heap * Not suitable for high-performance computing! -* NumPy provides a N-dimensional numeric array in Python +* Numpy provides a N-dimensional numeric array in Python * Perfect for high-performance computing. -* NumPy provides +* Numpy provides * elementwise computations @@ -135,7 +135,7 @@ NumPy in one slide * pseudorandom numbers from many distributions -* SciPy provides lots more, including +* Scipy provides lots more, including * more linear algebra @@ -148,29 +148,29 @@ NumPy in one slide .. code-block:: python ############################## - # Properties of NumPy arrays + # Properties of Numpy arrays # that you really need to know ############################## import numpy as np # import can rename - a = np.random.rand(3, 4, 5) # random generators + a = np.random.rand(3,4,5) # random generators a32 = a.astype('float32') # arrays are strongly typed a.ndim # int: 3 - a.shape # tuple: (3, 4, 5) + a.shape # tuple: (3,4,5) a.size # int: 60 a.dtype # np.dtype object: 'float64' a32.dtype # np.dtype object: 'float32' Arrays can be combined with numeric operators, standard mathematical -functions. NumPy has great `documentation `_. +functions. Numpy has great `documentation `_. -Training an MNIST-ready classification neural network in pure NumPy might look like this: +Training an MNIST-ready classification neural network in pure numpy might look like this: .. code-block:: python ######################### - # NumPy for Training a + # Numpy for Training a # Neural Network on MNIST ######################### @@ -186,23 +186,23 @@ Training an MNIST-ready classification neural network in pure NumPy might look l batchsize = 100 for i in xrange(1000): - x_i = x[i * batchsize: (i + 1) * batchsize] - y_i = y[i * batchsize: (i + 1) * batchsize] + x_i = x[i*batchsize:(i+1)*batchsize] + y_i = y[i*batchsize:(i+1)*batchsize] hidin = np.dot(x_i, w) + b hidout = np.tanh(hidin) outin = np.dot(hidout, v) + c - outout = (np.tanh(outin) + 1) / 2.0 + outout = (np.tanh(outin)+1)/2.0 g_outout = outout - y_i - err = 0.5 * np.sum(g_outout ** 2) + err = 0.5 * np.sum(g_outout**2) g_outin = g_outout * outout * (1.0 - outout) g_hidout = np.dot(g_outin, v.T) - g_hidin = g_hidout * (1 - hidout ** 2) + g_hidin = g_hidout * (1 - hidout**2) b -= lr * np.sum(g_hidin, axis=0) c -= lr * np.sum(g_outin, axis=0) @@ -215,9 +215,9 @@ What's missing? * Non-lazy evaluation (required by Python) hurts performance -* NumPy is bound to the CPU +* Numpy is bound to the CPU -* NumPy lacks symbolic or automatic differentiation +* Numpy lacks symbolic or automatic differentiation Now let's have a look at the same algorithm in Theano, which runs 15 times faster if you have GPU (I'm skipping some dtype-details which we'll come back to). @@ -229,42 +229,40 @@ you have GPU (I'm skipping some dtype-details which we'll come back to). 
# Neural Network on MNIST ######################### - import numpy as np - - import theano - import theano.tensor as tensor + import theano as T + import theano.tensor as TT x = np.load('data_x.npy') y = np.load('data_y.npy') # symbol declarations - sx = tensor.matrix() - sy = tensor.matrix() - w = theano.shared(np.random.normal(avg=0, std=.1, - size=(784, 500))) - b = theano.shared(np.zeros(500)) - v = theano.shared(np.zeros((500, 10))) - c = theano.shared(np.zeros(10)) + sx = TT.matrix() + sy = TT.matrix() + w = T.shared(np.random.normal(avg=0, std=.1, + size=(784, 500))) + b = T.shared(np.zeros(500)) + v = T.shared(np.zeros((500, 10))) + c = T.shared(np.zeros(10)) # symbolic expression-building - hid = tensor.tanh(tensor.dot(sx, w) + b) - out = tensor.tanh(tensor.dot(hid, v) + c) - err = 0.5 * tensor.sum(out - sy) ** 2 - gw, gb, gv, gc = tensor.grad(err, [w, b, v, c]) + hid = TT.tanh(TT.dot(sx, w) + b) + out = TT.tanh(TT.dot(hid, v) + c) + err = 0.5 * TT.sum(out - sy)**2 + gw, gb, gv, gc = TT.grad(err, [w,b,v,c]) # compile a fast training function - train = theano.function([sx, sy], err, + train = T.function([sx, sy], err, updates={ - w: w - lr * gw, - b: b - lr * gb, - v: v - lr * gv, - c: c - lr * gc}) + w:w - lr * gw, + b:b - lr * gb, + v:v - lr * gv, + c:c - lr * gc}) # now do the computations batchsize = 100 for i in xrange(1000): - x_i = x[i * batchsize: (i + 1) * batchsize] - y_i = y[i * batchsize: (i + 1) * batchsize] + x_i = x[i*batchsize:(i+1)*batchsize] + y_i = y[i*batchsize:(i+1)*batchsize] err_i = train(x_i, y_i) @@ -288,7 +286,7 @@ Theano in one slide * Expression substitution optimizations automatically draw on many backend technologies for best performance. - * FFTW, MKL, ATLAS, SciPy, Cython, CUDA + * FFTW, MKL, ATLAS, Scipy, Cython, CUDA * Slower fallbacks always available diff --git a/doc/cifarSC2011/pyCUDA.txt b/doc/cifarSC2011/pyCUDA.txt index 6713fe189c0..6aac7f5c3da 100644 --- a/doc/cifarSC2011/pyCUDA.txt +++ b/doc/cifarSC2011/pyCUDA.txt @@ -75,12 +75,12 @@ Exercise 6 - Modify and execute it to work for a matrix of 20 x 10 -.. _cifar2011_pyCUDA_theano: +.. _pyCUDA_theano: Theano + PyCUDA --------------- -.. testcode:: +.. code-block:: python import numpy, theano import theano.misc.pycuda_init @@ -118,20 +118,15 @@ Theano + PyCUDA pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size), block=(512,1,1), grid=grid) return thunk - -.. testoutput:: - :hide: - :options: +SKIP - - This contains GPU code so skip it + Test it! ->>> x = theano.tensor.fmatrix() # doctest: +SKIP ->>> f = theano.function([x], PyCUDADoubleOp()(x)) # doctest: +SKIP ->>> xv=numpy.ones((4,5), dtype="float32") # doctest: +SKIP ->>> assert numpy.allclose(f(xv), xv*2) # doctest: +SKIP ->>> print numpy.asarray(f(xv)) # doctest: +SKIP +>>> x = theano.tensor.fmatrix() +>>> f = theano.function([x], PyCUDADoubleOp()(x)) +>>> xv=numpy.ones((4,5), dtype="float32") +>>> assert numpy.allclose(f(xv), xv*2) +>>> print numpy.asarray(f(xv)) Exercises 7 ----------- diff --git a/doc/cifarSC2011/theano.txt b/doc/cifarSC2011/theano.txt index ce726dc8afb..659526d6095 100644 --- a/doc/cifarSC2011/theano.txt +++ b/doc/cifarSC2011/theano.txt @@ -345,11 +345,20 @@ Differentiation details * We are working on the missing optimizations to be able to compute efficently the full Jacobian and Hessian and Jacobian x vector -.. 
_cifar2011_benchmark: Benchmarks ---------- +Example: + +* Multi-layer perceptron +* Convolutional Neural Networks +* Misc Elemwise operations + +Competitors: NumPy + SciPy, MATLAB, EBLearn, Torch5, numexpr + +* EBLearn, Torch5: specialized libraries written by practitioners specifically for these tasks +* numexpr: similar to Theano, 'virtual machine' for elemwise expressions **Multi-Layer Perceptron**: diff --git a/doc/citation.txt b/doc/citation.txt deleted file mode 100644 index 804d115d7a4..00000000000 --- a/doc/citation.txt +++ /dev/null @@ -1,41 +0,0 @@ - -.. _citation: - - -Theano Citation Policy -====================== - -If you use Theano for academic research, you are highly encouraged (though not -required) to cite the following two papers: - -* F. Bastien, P. Lamblin, R. Pascanu, J. Bergstra, I. Goodfellow, - A. Bergeron, N. Bouchard, D. Warde-Farley and Y. Bengio. - `"Theano: new features and speed improvements" - `_. - NIPS 2012 deep learning workshop. (`BibTex - `_) - -* J. Bergstra, O. Breuleux, F. Bastien, P. Lamblin, R. - Pascanu, G. Desjardins, J. Turian, D. Warde-Farley and Y. - Bengio. `"Theano: A CPU and GPU Math Expression Compiler" - `_. - *Proceedings of the Python for Scientific Computing Conference (SciPy) - 2010. June 30 - July 3, Austin, TX* (`BibTeX - `_) - -Theano is primarily developed by academics, and so citations matter a lot to -us. As an added benefit, you increase Theano's exposure and potential user -(and developer) base, which is to the benefit of all users of Theano. Thanks -in advance! - -Previously, we only asked users of Theano to cite the original 2010 paper. However, -this policy did not give appropriate credit to the many members of our community -who have contributed to Theano in the meantime. - -In the future, we intend to introduce new papers periodically (hopefully approximately -once per year) with a comprehensive author list. As soon as one of these papers is -prepared, we will only ask for users to cite the single most recent paper with the -most comprehensive author list. - - - diff --git a/doc/conf.py b/doc/conf.py index d0c72ac8075..2dc96752819 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -23,7 +23,7 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'sphinx.ext.doctest'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo'] todo_include_todos = True @@ -53,7 +53,7 @@ # The short X.Y version. version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.6' +release = '0.6rc3' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: diff --git a/doc/crei2013/advanced_theano.txt b/doc/crei2013/advanced_theano.txt deleted file mode 100644 index 2f91f5cf7d1..00000000000 --- a/doc/crei2013/advanced_theano.txt +++ /dev/null @@ -1,216 +0,0 @@ - -.. _crei2013_advanced_theano: - -*************** -Advanced Theano -*************** - - -Profiling ---------- - -- To replace the default mode with this mode, use the Theano flags ``profile=True`` - -- To enable the memory profiling use the flags ``profile_memory=True`` - -Theano output: - -.. literalinclude:: logreg_profile.prof - -Compilation pipeline --------------------- - -.. 
image:: ../hpcs2011_tutorial/pics/pipeline.png - :width: 400 px - - -Inplace optimization --------------------- - -- 2 type of inplace operations: - - - An op that return a view on its inputs (e.g. reshape, inplace transpose) - - An op that write the output on the inputs memory space - -- This allows some memory optimization -- The Op must tell Theano if they work inplace -- Inplace Op add constraints to the order of execution - - -Conditions ----------- -**IfElse** - -- Build condition over symbolic variables. -- IfElse Op takes a boolean condition and two variables to compute as input. -- While Switch Op evaluates both 'output' variables, IfElse Op is lazy and only - evaluates one variable respect to the condition. - -**IfElse Example: Comparison with Switch** - -.. literalinclude:: ifelse_switch.py - -IfElse Op spend less time (about an half) than Switch since it computes only -one variable instead of both. - ->>> python ifelse_switch.py -time spent evaluating both values 0.230000 sec -time spent evaluating one value 0.120000 sec - -Note that IfElse condition is a boolean while Switch condition is a tensor, so -Switch is more general. - -It is actually important to use ``linker='vm'`` or ``linker='cvm'``, -otherwise IfElse will compute both variables and take the same computation -time as the Switch Op. The linker is not currently set by default to 'cvm' but -it will be in a near future. - -Loops ------ - -**Scan** - -- General form of **recurrence**, which can be used for looping. -- **Reduction** and **map** (loop over the leading dimensions) are special cases of Scan -- You 'scan' a function along some input sequence, producing an output at each time-step -- The function can see the **previous K time-steps** of your function -- ``sum()`` could be computed by scanning the z + x(i) function over a list, given an initial state of ``z=0``. -- Often a for-loop can be expressed as a ``scan()`` operation, and ``scan`` is the closest that Theano comes to looping. -- The advantage of using ``scan`` over for loops - - - The number of iterations to be part of the symbolic graph - - Minimizes GPU transfers if GPU is involved - - Compute gradients through sequential steps - - Slightly faster then using a for loop in Python with a compiled Theano function - - Can lower the overall memory usage by detecting the actual amount of memory needed - -**Scan Example: Computing pow(A,k)** - -.. literalinclude:: scan_pow.py - - -**Scan Example: Calculating a Polynomial** - -.. literalinclude:: scan_poly.py - -Exercise 4 ------------ - -- Run both examples -- Modify and execute the polynomial example to have the reduction done by scan - - -Exercise 5 ------------ - -- In the last exercises, do you see a speed up with the GPU? -- Where does it come from? (Use ProfileMode) -- Is there something we can do to speed up the GPU version? 
- - -Printing/Drawing Theano graphs ------------------------------- - -- Pretty Printing - -``theano.printing.pprint(variable)`` - ->>> theano.printing.pprint(prediction) -gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))),TensorConstant{0.5}) - - -- Debug Print - -``theano.printing.debugprint({fct, variable, list of variables})`` - ->>> theano.printing.debugprint(prediction) -Elemwise{gt,no_inplace} [@181772236] '' - |Elemwise{true_div,no_inplace} [@181746668] '' - | |InplaceDimShuffle{x} [@181746412] '' - | | |TensorConstant{1} [@181745836] - | |Elemwise{add,no_inplace} [@181745644] '' - | | |InplaceDimShuffle{x} [@181745420] '' - | | | |TensorConstant{1} [@181744844] - | | |Elemwise{exp,no_inplace} [@181744652] '' - | | | |Elemwise{sub,no_inplace} [@181744012] '' - | | | | |Elemwise{neg,no_inplace} [@181730764] '' - | | | | | |dot [@181729676] '' - | | | | | | |x [@181563948] - | | | | | | |w [@181729964] - | | | | |InplaceDimShuffle{x} [@181743788] '' - | | | | | |b [@181730156] - |InplaceDimShuffle{x} [@181771788] '' - | |TensorConstant{0.5} [@181771148] ->>> theano.printing.debugprint(predict) -Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2 - |dot [@183018796] '' 1 - | |x [@183000780] - | |w [@183000812] - |InplaceDimShuffle{x} [@183133580] '' 0 - | |b [@183000876] - |TensorConstant{[ 0.5]} [@183084108] - -- Picture Printing of Graphs - ->>> theano.printing.pydotprint_variables(prediction) - -.. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_prediction.png - :width: 800 px - -All pydotprint* requires graphviz and pydot - ->>> theano.printing.pydotprint(predict) - -.. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_predic.png - :width: 800 px - ->>> theano.printing.pydotprint(train) # This is a small train example! - -.. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_train.png - :width: 1500 px - - -Debugging ---------- - -- Run with the Theano flag ``compute_test_value = {``off'',``ignore'', ``warn'', ``raise''}`` - - - Run the code as we create the graph - - Allows you to find the bug earlier (ex: shape mismatch) - - Makes it easier to identify where the problem is in *your* code - - Use the value of constants and shared variables directly - - For pure symbolic variables uses ``x.tag.test_value = numpy.random.rand(5,10)`` - -- Run with the flag ``mode=FAST_COMPILE`` - - - Few optimizations - - Run Python code (better error messages and can be debugged interactively in the Python debugger) - -- Run with the flag ``mode=DebugMode`` - - - 100-1000x slower - - Test all optimization steps from the original graph to the final graph - - Checks many things that Op should/shouldn't do - - Executes both the Python and C code versions - -Known limitations ------------------ - -- Compilation phase distinct from execution phase - - - Use ``a_tensor_variable.eval()`` to make this less visible - -- Compilation time can be significant - - - Amortize it with functions over big input or reuse functions - -- Execution overhead - - - We have worked on this, but more work needed - - So needs a certain number of operations to be useful - -- Compilation time superlinear in the size of the graph. - - - Hundreds of nodes is fine - - Disabling a few optimizations can speed up compilation - - Usually too many nodes indicates a problem with the graph diff --git a/doc/crei2013/gpundarray.txt b/doc/crei2013/gpundarray.txt deleted file mode 100644 index f0462975a17..00000000000 --- a/doc/crei2013/gpundarray.txt +++ /dev/null @@ -1,42 +0,0 @@ - -.. 
_crei2013_gpundarray: - -********** -GpuNdArray -********** - -Why a common GPU ndarray? -------------------------- - -- Currently there are at least 4 different GPU array data structures in use by Python packages - - - CudaNdarray (Theano), GPUArray (PyCUDA), CUDAMatrix (cudamat), GPUArray (PyOpenCL), ... - - There are even more if we include other languages - -- All of them are a subset of the functionality of ``numpy.ndarray`` on the GPU -- Lots of duplicated effort - - - GPU code is harder/slower to do {\bf correctly} and {\bf fast} than on the CPU/Python - -- Lack of a common array API makes it harder to port/reuse code -- Also harder to find/distribute code -- Divides development work - - -Design Goals ------------- - -- Make it VERY similar to ``numpy.ndarray`` -- Be compatible with both CUDA and OpenCL -- Have the base object accessible from C to allow collaboration with more projects, across high-level languages - - - We want people from C, C++, Ruby, R, ... all use the same base GPU N-dimensional array - - -Final Note ----------- - -- Under development -- Will be the next GPU array container for Theano (*this summer!*) -- Probably also for PyCUDA, PyOpenCL -- Mailing list: http://lists.tiker.net/listinfo/gpundarray diff --git a/doc/crei2013/ifelse_switch.py b/doc/crei2013/ifelse_switch.py deleted file mode 100644 index 31cd3223ca2..00000000000 --- a/doc/crei2013/ifelse_switch.py +++ /dev/null @@ -1,33 +0,0 @@ -import time - -import numpy - -import theano -from theano import tensor as tt -from theano.ifelse import ifelse - -a, b = tt.scalars('a', 'b') -x, y = tt.matrices('x', 'y') - -z_switch = tt.switch(tt.lt(a, b), tt.mean(x), tt.mean(y)) -z_lazy = ifelse(tt.lt(a, b), tt.mean(x), tt.mean(y)) - -f_switch = theano.function([a, b, x, y], z_switch) -f_lazyifelse = theano.function([a, b, x, y], z_lazy) - -val1 = 0. -val2 = 1. -big_mat1 = numpy.ones((10000, 1000)) -big_mat2 = numpy.ones((10000, 1000)) - -n_times = 10 - -tic = time.clock() -for i in xrange(n_times): - f_switch(val1, val2, big_mat1, big_mat2) -print 'time spent evaluating both values %f sec' % (time.clock() - tic) - -tic = time.clock() -for i in xrange(n_times): - f_lazyifelse(val1, val2, big_mat1, big_mat2) -print 'time spent evaluating one value %f sec' % (time.clock() - tic) \ No newline at end of file diff --git a/doc/crei2013/index.txt b/doc/crei2013/index.txt deleted file mode 100644 index 101ce7f2970..00000000000 --- a/doc/crei2013/index.txt +++ /dev/null @@ -1,71 +0,0 @@ - -.. _crei2013_index: - -=========================== -Theano Tutorial @ CREI 2013 -=========================== - -July 19, 2013, Sherbrook, Québec, Canada. - - -Theano is python software for evaluating complicated array expressions. - -What does it do? - - * aggressive expression optimizations, - - * automatic GPU use, - - * symbolic differentiation and R op. - -It complements the Python numeric/scientific software stack (e.g. NumPy, SciPy, -scikits, matplotlib, PIL.) - -Design and feature set has been driven by machine learning research -at the University of -Montreal (groups of Yoshua Bengio, Pascal Vincent, Aaron Courville and Roland Memisevic) -The result is a very good library for doing research in deep -learning and neural network training, and a flexible framework for -many other models and algorithms in machine learning more generally. 
- -It has proven to be useful for implementing: - - - linear and nonlinear neural network classifiers - - - convolutional models - - - Energy models: RBM, DBN, GRBM, ssRBM, AIS - - - Auto-encoders: DAE, CAE - - - GP regression - - - sparse coding - - - recurrent neural networks, echo state, (HMM?) - - - online and batch learning and optimization - - - Even SVM! - -As people's needs change this list will grow, but Theano is built -around vector, matrix, and tensor expressions; there is little reason -to use it for calculations on other data structures except. There is -also some sparse matrix support. - - -Contents --------- - -The structured part of these lab sessions will be a walk-through of the following -material. Interleaved with this structured part will be blocks of time for -individual or group work. The idea is that you can try out Theano and get help -from gurus on hand if you get stuck. - -.. toctree:: - - introduction - theano - advanced_theano - gpundarray - /tutorial/extending_theano diff --git a/doc/crei2013/introduction.txt b/doc/crei2013/introduction.txt deleted file mode 100644 index 8e0269b47e5..00000000000 --- a/doc/crei2013/introduction.txt +++ /dev/null @@ -1,397 +0,0 @@ - -.. _crei2013_Introduction: - - -************ -Introduction -************ - -Background Questionaire ------------------------ - -* Who has used Theano before? - - * What did you do with it? - -* Who has used Python? NumPy? SciPy? matplotlib? - -* Who has used iPython? - - * Who has used it as a distributed computing engine? - -* Who has done C/C++ programming? - -* Who has organized computation around a particular physical memory layout? - -* Who has used a multidimensional array of >2 dimensions? - -* Who has written a Python module in C before? - - * Who has written a program to *generate* Python modules in C? - -* Who has used a templating engine? - -* Who has programmed a GPU before? - - * Using OpenGL / shaders ? - - * Using CUDA (runtime? / driver?) - - * Using PyCUDA ? - - * Using OpenCL / PyOpenCL ? - - * Using cudamat / gnumpy ? - - * Other? - -* Who has used Cython? - - -Python in one slide -------------------- - -* General-purpose high-level OO interpreted language - -* Emphasizes code readability - -* Comprehensive standard library - -* Dynamic type and memory management - -* Built-in types: int, float, str, list, dict, tuple, object - -* Slow execution - -* Popular in web-dev and scientific communities - - -.. code-block:: python - - ####################### - # PYTHON SYNTAX EXAMPLE - ####################### - a = 1 # no type declaration required! - b = (1, 2, 3) # tuple of three int literals - c = [1, 2, 3] # list of three int literals - d = {'a': 5, b: None} # dictionary of two elements - # N.B. string literal, None - - print d['a'] # square brackets index - # -> 5 - print d[(1, 2, 3)] # new tuple == b, retrieves None - # -> None - print d[6] - # raises KeyError Exception - - x, y, z = 10, 100, 100 # multiple assignment from tuple - x, y, z = b # unpacking a sequence - - b_squared = [b_i**2 for b_i in b] # list comprehension - - def foo(b, c=3): # function w default param c - return a + b + c # note scoping, indentation - - foo(5) # calling a function - # -> 1 + 5 + 3 == 9 # N.B. 
scoping - foo(b=6, c=2) # calling with named args - # -> 1 + 6 + 2 == 9 - - print b[1:3] # slicing syntax - - class Foo(object): # Defining a class - def __init__(self): - self.a = 5 - def hello(self): - return self.a - - f = Foo() # Creating a class instance - print f.hello() # Calling methods of objects - # -> 5 - - class Bar(Foo): # Defining a subclass - def __init__(self, a): - self.a = a - - print Bar(99).hello() # Creating an instance of Bar - # -> 99 - -NumPy in one slide ------------------- - -* Python floats are full-fledged objects on the heap - - * Not suitable for high-performance computing! - -* NumPy provides a N-dimensional numeric array in Python - - * Perfect for high-performance computing. - * Slice are return view (no copy) - -* NumPy provides - - * elementwise computations - - * linear algebra, Fourier transforms - - * pseudorandom numbers from many distributions - -* SciPy provides lots more, including - - * more linear algebra - - * solvers and optimization algorithms - - * matlab-compatible I/O - - * I/O and signal processing for images and audio - -.. code-block:: python - - ############################## - # Properties of NumPy arrays - # that you really need to know - ############################## - - import numpy as np # import can rename - a = np.random.rand(3, 4, 5) # random generators - a32 = a.astype('float32') # arrays are strongly typed - - a.ndim # int: 3 - a.shape # tuple: (3, 4, 5) - a.size # int: 60 - a.dtype # np.dtype object: 'float64' - a32.dtype # np.dtype object: 'float32' - - assert a[1, 1, 1] != 10 # a[1, 1, 1] is a view - a[1, 1, 1] = 10 # So affectation to it change the - assert a[1, 1, 1] == 10 # original array - - -Arrays can be combined with numeric operators, standard mathematical -functions. NumPy has great `documentation `_. - -Training an MNIST-ready classification neural network in pure NumPy might look like this: - -.. code-block:: python - - ######################### - # NumPy for Training a - # Neural Network on MNIST - ######################### - - x = np.load('data_x.npy') - y = np.load('data_y.npy') - w = np.random.normal( - avg=0, - std=.1, - size=(784, 500)) - b = np.zeros((500,)) - v = np.zeros((500, 10)) - c = np.zeros((10,)) - - batchsize = 100 - for i in xrange(1000): - x_i = x[i * batchsize: (i + 1) * batchsize] - y_i = y[i * batchsize: (i + 1) * batchsize] - - hidin = np.dot(x_i, w) + b - - hidout = np.tanh(hidin) - - outin = np.dot(hidout, v) + c - outout = (np.tanh(outin) + 1) / 2.0 - - g_outout = outout - y_i - err = 0.5 * np.sum(g_outout) ** 2 - - g_outin = g_outout * outout * (1.0 - outout) - - g_hidout = np.dot(g_outin, v.T) - g_hidin = g_hidout * (1 - hidout ** 2) - - b -= lr * np.sum(g_hidin, axis=0) - c -= lr * np.sum(g_outin, axis=0) - w -= lr * np.dot(x_i.T, g_hidin) - v -= lr * np.dot(hidout.T, g_outin) - - -What's missing? ---------------- - -* Non-lazy evaluation (required by Python) hurts performance - -* NumPy is bound to the CPU - -* NumPy lacks symbolic or automatic differentiation - -Now let's have a look at the same algorithm in Theano, which runs 15 times faster if -you have GPU (I'm skipping some dtype-details which we'll come back to). - -.. 
code-block:: python - - ######################### - # Theano for Training a - # Neural Network on MNIST - ######################### - - import numpy as np - - import theano - import theano.tensor as tensor - - x = np.load('data_x.npy') - y = np.load('data_y.npy') - - # symbol declarations - sx = tensor.matrix() - sy = tensor.matrix() - w = theano.shared(np.random.normal(avg=0, std=.1, - size=(784, 500))) - b = theano.shared(np.zeros(500)) - v = theano.shared(np.zeros((500, 10))) - c = theano.shared(np.zeros(10)) - - # symbolic expression-building - hid = tensor.tanh(tensor.dot(sx, w) + b) - out = tensor.tanh(tensor.dot(hid, v) + c) - err = 0.5 * tensor.sum(out - sy) ** 2 - gw, gb, gv, gc = tensor.grad(err, [w, b, v, c]) - - # compile a fast training function - train = theano.function([sx, sy], err, - updates={ - w: w - lr * gw, - b: b - lr * gb, - v: v - lr * gv, - c: c - lr * gc}) - - # now do the computations - batchsize = 100 - for i in xrange(1000): - x_i = x[i * batchsize: (i + 1) * batchsize] - y_i = y[i * batchsize: (i + 1) * batchsize] - err_i = train(x_i, y_i) - - -Theano in one slide -------------------- - -* High-level domain-specific language tailored to numeric computation - -* Compiles most common expressions to C for CPU and GPU. - -* Limited expressivity means lots of opportunities for expression-level optimizations - - * No function call -> global optimization - - * Strongly typed -> compiles to machine instructions - - * Array oriented -> parallelizable across cores - - * Support for looping and branching in expressions - -* Expression substitution optimizations automatically draw - on many backend technologies for best performance. - - * FFTW, MKL, ATLAS, SciPy, Cython, CUDA - - * Slower fallbacks always available - -* Automatic differentiation and R op - -* Sparse matrices - - -Project status --------------- - -* Mature: theano has been developed and used since January 2008 (5.5 yrs old) - -* Driven over 87 research papers - -* Good user documentation - -* Active mailing list with participants from outside our lab - -* Core technology for a funded Silicon-Valley startup - -* Many contributors (some from outside our lab) - -* Used to teach IFT6266 for many years - -* Used for research at Google and Yahoo. - -* Downloads (January 2011 - June 8 2011): - - * Pypi (16 July 2013): 60k total, 159 last day, 823 last week - - * Github (`bleeding edge` repository): unknown - - - - -Why scripting for GPUs? ------------------------ - -They *Complement each other*: - -* GPUs are everything that scripting/high level languages are not - - * Highly parallel - - * Very architecture-sensitive - - * Built for maximum FP/memory throughput - - * So hard to program that meta-programming is easier. - -* CPU: largely restricted to control - - * Optimized for sequential code and low latency (rather than high throughput) - - * Tasks (1000/sec) - - * Scripting fast enough - -Best of both: scripted CPU invokes JIT-compiled kernels on GPU. - - -How Fast are GPUs? ------------------- - -* Theory - - * Intel Core i7 980 XE (107Gf/s float64) 6 cores - - * NVIDIA C2050 (515 Gf/s float64, 1Tf/s float32) 480 cores - - * NVIDIA GTX580 (1.5Tf/s float32) 512 cores - - * GPUs are faster, cheaper, more power-efficient - -* Practice (our experience) - - * Depends on algorithm and implementation! - - * Reported speed improvements over CPU in lit. vary *widely* (.01x to 1000x) - - * Matrix-matrix multiply speedup: usually about 10-20x. - - * Convolution speedup: usually about 15x. 
- - * Elemwise speedup: slower or up to 100x (depending on operation and layout) - - * Sum: can be faster or slower depending on layout. - -* Benchmarking is delicate work... - - * How to control quality of implementation? - - * How much time was spent optimizing CPU vs GPU code? - - * Theano goes up to 100x faster on GPU because it uses only one CPU core - - * Theano can be linked with multi-core capable BLAS (GEMM and GEMV) - -* If you see speedup > 100x, the benchmark is probably not fair. diff --git a/doc/crei2013/logreg.py b/doc/crei2013/logreg.py deleted file mode 100644 index bd1bbbe6ff8..00000000000 --- a/doc/crei2013/logreg.py +++ /dev/null @@ -1,44 +0,0 @@ -import numpy -import theano -import theano.tensor as tt -rng = numpy.random - -N = 400 -feats = 784 -D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2)) -training_steps = 10000 - -# Declare Theano symbolic variables -x = tt.matrix("x") -y = tt.vector("y") -w = theano.shared(rng.randn(feats), name="w") -b = theano.shared(0., name="b") -print "Initial model:" -print w.get_value(), b.get_value() - -# Construct Theano expression graph -p_1 = 1 / (1 + tt.exp(-tt.dot(x, w) - b)) # Probability that target = 1 -prediction = p_1 > 0.5 # The prediction thresholded -xent = -y * tt.log(p_1) - (1 - y) * tt.log(1 - p_1) # Cross-entropy loss -cost = xent.mean() + 0.01 * (w ** 2).sum() # The cost to minimize -gw, gb = tt.grad(cost, [w, b]) - -# Compile -train = theano.function( - inputs=[x, y], - outputs=[prediction, xent], - updates=[(w, w - 0.1 * gw), - (b, b - 0.1 * gb)], - name='train') - -predict = theano.function(inputs=[x], outputs=prediction, - name='predict') - -# Train -for i in range(training_steps): - pred, err = train(D[0], D[1]) - -print "Final model:" -print w.get_value(), b.get_value() -print "target values for D:", D[1] -print "prediction on D:", predict(D[0]) diff --git a/doc/crei2013/logreg_profile.prof b/doc/crei2013/logreg_profile.prof deleted file mode 100644 index bd61054a32e..00000000000 --- a/doc/crei2013/logreg_profile.prof +++ /dev/null @@ -1,121 +0,0 @@ -Function profiling -================== - Message: train - Time in 10000 calls to Function.__call__: 7.171231e+00s - Time in Function.fn.__call__: 6.686692e+00s (93.243%) - Time in thunks: 6.511275e+00s (90.797%) - Total compile time: 6.550491e-01s - Theano Optimizer time: 5.976810e-01s - Theano validate time: 1.260662e-02s - Theano Linker time (includes C, CUDA code generation/compiling): 2.649593e-02s - -Class ---- -<% time>
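A profile in this format can be produced by enabling Theano's profiler. The sketch below is one way to do it for the ``train`` function from ``logreg.py`` above; it assumes the ``profile`` argument of ``theano.function`` and the ``profile`` Theano flag, and is not necessarily how this particular profile was generated.

.. code-block:: python

    # sketch: recompile `train` from logreg.py with profiling enabled
    train = theano.function(
        inputs=[x, y],
        outputs=[prediction, xent],
        updates=[(w, w - 0.1 * gw),
                 (b, b - 0.1 * gb)],
        name='train',
        profile=True)

    for i in range(training_steps):
        train(D[0], D[1])

    # print the per-Op / per-Class timing tables
    train.profile.summary()

Alternatively, the unmodified script can be run as ``THEANO_FLAGS=profile=True python logreg.py``, which prints the profile when the process exits.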