From 997b1788a4635c66721f052bc0dfde1bb5c9759b Mon Sep 17 00:00:00 2001 From: Greg Date: Mon, 19 Jul 2021 09:14:14 -0700 Subject: [PATCH 1/2] Edits and tutorial fixes. --- docs/src/examples/augmented_neural_ode.md | 2 +- docs/src/examples/mnist_conv_neural_ode.md | 2 +- docs/src/examples/mnist_neural_ode.md | 18 +++++++++--------- docs/src/examples/neural_ode_flux.md | 4 ++-- docs/src/examples/neural_ode_galacticoptim.md | 7 +++++-- docs/src/examples/neural_ode_sciml.md | 14 +++++++------- 6 files changed, 25 insertions(+), 22 deletions(-) diff --git a/docs/src/examples/augmented_neural_ode.md b/docs/src/examples/augmented_neural_ode.md index d530444213..beda2c1d45 100644 --- a/docs/src/examples/augmented_neural_ode.md +++ b/docs/src/examples/augmented_neural_ode.md @@ -26,7 +26,7 @@ function concentric_sphere(dim, inner_radius_range, outer_radius_range, end data = cat(data..., dims=2) labels = cat(labels..., dims=2) - return DataLoader(data |> gpu, labels |> gpu; batchsize=batch_size, shuffle=true, + return DataLoader((data |> gpu, labels |> gpu); batchsize=batch_size, shuffle=true, partial=false) end diff --git a/docs/src/examples/mnist_conv_neural_ode.md b/docs/src/examples/mnist_conv_neural_ode.md index f7cf9bc1fa..1710c537ed 100644 --- a/docs/src/examples/mnist_conv_neural_ode.md +++ b/docs/src/examples/mnist_conv_neural_ode.md @@ -344,7 +344,7 @@ This callback function is used to print both the training and testing accuracy a ```julia cb() = begin global iter += 1 - # Monitor that the weights do infact update + # Monitor that the weights update # Every 10 training iterations show accuracy if iter % 10 == 1 train_accuracy = accuracy(model, train_dataloader) * 100 diff --git a/docs/src/examples/mnist_neural_ode.md b/docs/src/examples/mnist_neural_ode.md index e47c0683d9..df98ef912f 100644 --- a/docs/src/examples/mnist_neural_ode.md +++ b/docs/src/examples/mnist_neural_ode.md @@ -39,7 +39,7 @@ const bs = 128 const train_split = 0.9 train_dataloader, test_dataloader = loadmnist(bs, train_split) -down = Chain(flatten, Dense(784, 20, tanh)) |> gpu +down = Chain(Flux.flatten, Dense(784, 20, tanh)) |> gpu nn = Chain(Dense(20, 10, tanh), Dense(10, 10, tanh), @@ -58,7 +58,7 @@ function DiffEqArray_to_Array(x) return reshape(xarr, size(xarr)[1:2]) end -# Build our over-all model topology +# Build our overall model topology model = Chain(down, nn_ode, DiffEqArray_to_Array, @@ -196,7 +196,7 @@ to the next. Four different sets of layers are used here: ```julia -down = Chain(flatten, Dense(784, 20, tanh)) |> gpu +down = Chain(Flux.flatten, Dense(784, 20, tanh)) |> gpu nn = Chain(Dense(20, 10, tanh), Dense(10, 10, tanh), @@ -227,8 +227,8 @@ fc = Chain(Dense(20, 10)) |> gpu ### Array Conversion -When using `NeuralODE`, we can use the following function as a cheap conversion of `DiffEqArray` -from the ODE solver into a Matrix that can be used in the following layer: +When using `NeuralODE`, this function converts the ODESolution's `DiffEqArray` to +a Matrix (CuArray), and reduces the matrix from 3 to 2 dimensions for use in the next layer. ```julia function DiffEqArray_to_Array(x) @@ -238,7 +238,7 @@ end ``` For CPU: If this function does not automatically fallback to CPU when no GPU is present, we can -change `gpu(x)` with `Array(x)`. +change `gpu(x)` to `Array(x)`. ### Build Topology @@ -246,7 +246,7 @@ change `gpu(x)` with `Array(x)`. 
Next we connect all layers together in a single chain:
 
 ```julia
-# Build our over-all model topology
+# Build our overall model topology
 model = Chain(down,
               nn_ode,
               DiffEqArray_to_Array,
@@ -343,7 +343,7 @@ This callback function is used to print both the training and testing accuracy a
 ```julia
 cb() = begin
     global iter += 1
-    # Monitor that the weights do infact update
+    # Monitor that the weights update
     # Every 10 training iterations show accuracy
     if iter % 10 == 1
         train_accuracy = accuracy(model, train_dataloader) * 100
@@ -363,7 +363,7 @@ for Neural ODE is given by `nn_ode.p`:
 
 ```julia
 # Train the NN-ODE and monitor the loss and weights.
-Flux.train!(loss, params( down, nn_ode.p, fc), zip( x_train, y_train ), opt, cb = cb)
+Flux.train!(loss, Flux.params( down, nn_ode.p, fc), zip( x_train, y_train ), opt, cb = cb)
 ```
 
 ### Expected Output
diff --git a/docs/src/examples/neural_ode_flux.md b/docs/src/examples/neural_ode_flux.md
index 4fdb3ad504..57592d94b7 100644
--- a/docs/src/examples/neural_ode_flux.md
+++ b/docs/src/examples/neural_ode_flux.md
@@ -24,7 +24,7 @@ dudt2 = Chain(x -> x.^3,
              Dense(2,50,tanh),
              Dense(50,2))
 p,re = Flux.destructure(dudt2) # use this p as the initial condition!
-dudt(u,p,t) = re(p)(u) # need to restrcture for backprop!
+dudt(u,p,t) = re(p)(u) # need to restructure for backprop!
 prob = ODEProblem(dudt,u0,tspan)
 
 function predict_n_ode()
@@ -39,7 +39,7 @@ end
 loss_n_ode() # n_ode.p stores the initial parameters of the neural ODE
 
-cb = function (;doplot=false) #callback function to observe training
+cb = function (;doplot=false) # callback function to observe training
   pred = predict_n_ode()
   display(sum(abs2,ode_data .- pred))
   # plot current prediction against data
diff --git a/docs/src/examples/neural_ode_galacticoptim.md b/docs/src/examples/neural_ode_galacticoptim.md
index a0b9345e8f..89e119bda9 100644
--- a/docs/src/examples/neural_ode_galacticoptim.md
+++ b/docs/src/examples/neural_ode_galacticoptim.md
@@ -5,7 +5,7 @@ a lot of the choices, using heuristics to determine a potentially efficient meth
 However, in some cases you may want more control over the optimization process.
 The underlying optimization package behind `sciml_train` is
 [GalacticOptim.jl](https://github.com/SciML/GalacticOptim.jl).
-In this tutorial we will show how to more deeply interact with the optimzation
+In this tutorial we will show how to more deeply interact with the optimization
 library to tweak its processes.
 
 We can use a neural ODE as our example. A neural ODE is an ODE where a neural
@@ -169,11 +169,14 @@ set up custom optimization problems. For more information on the usage of
 [GalacticOptim.jl](https://github.com/SciML/GalacticOptim.jl), please consult
 [this](https://galacticoptim.sciml.ai/stable/) documentation.
 
+The `x` and `p` variables in the optimization function are different from the
+`x` and `p` above. The optimization function runs over the space of parameters of
+the original problem, so `x_optimization` corresponds to `p_original`.
 ```julia
 # Train using the ADAM optimizer
 adtype = GalacticOptim.AutoZygote()
 
-optf = GalacticOptim.OptimizationFunction((x, p) -> loss_neuralode(x), adtype)
+optf = GalacticOptim.OptimizationFunction((x_optimization, p_optimization) -> loss_neuralode(x_optimization), adtype)
 optfunc = GalacticOptim.instantiate_function(optf, prob_neuralode.p, adtype, nothing)
 optprob = GalacticOptim.OptimizationProblem(optfunc, prob_neuralode.p)
 
diff --git a/docs/src/examples/neural_ode_sciml.md b/docs/src/examples/neural_ode_sciml.md
index 33d10bfaec..dbffce5280 100644
--- a/docs/src/examples/neural_ode_sciml.md
+++ b/docs/src/examples/neural_ode_sciml.md
@@ -64,7 +64,7 @@ result_neuralode = DiffEqFlux.sciml_train(loss_neuralode, prob_neuralode.p,
 
 ## Explanation
 
-Let's get a time series array from the Lotka-Volterra equation as data:
+Let's generate a time series array from a cubic ODE as data:
 
 ```julia
 using DiffEqFlux, DifferentialEquations, Plots
@@ -104,9 +104,9 @@ dudt2 = Chain(x -> x.^3,
 
 In our model we used the `x -> x.^3` assumption in the model. By incorporating structure
 into our equations, we can reduce the required size and training time
-for the neural network, but a good guess needs to be known!
+for the neural network, but we need a good guess!
 
-From here we build a loss function around it. The `NeuralODE` has an optional
+From here, we build a loss function around our `NeuralODE`. `NeuralODE` has an optional
 second argument for new parameters which we will use to iteratively change the
 neural network in our training loop. We will use the L2 loss of the network's
 output against the time series data:
@@ -139,18 +139,18 @@ callback = function (p, l, pred; doplot = false)
 end
 ```
 
-We then train the neural network to learn the ODE. Using `sciml_train`, heuristics
-are chosen that does this fast and simply:
+We then train the neural network to learn the ODE. `sciml_train` chooses heuristics
+that train quickly and simply:
 
 ```julia
 result_neuralode = DiffEqFlux.sciml_train(loss_neuralode, prob_neuralode.p,
                                           cb = callback)
 ```
 
-## Usage without the layer
+## Usage Without the Layer Function
 
 Note that you can equivalently define the NeuralODE by hand instead of using
-the layer function. With `FastChain` this would look like:
+the `NeuralODE` layer function. With `FastChain` this would look like:
 
 ```julia
 dudt!(u, p, t) = dudt2(u, p)

From a5d077cbf1ec0867a4bc2d849148e8c8f869c201 Mon Sep 17 00:00:00 2001
From: Greg
Date: Mon, 19 Jul 2021 09:24:42 -0700
Subject: [PATCH 2/2] Add detailed comments to sciml_train tutorial.
---
 docs/src/examples/neural_ode_sciml.md | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/docs/src/examples/neural_ode_sciml.md b/docs/src/examples/neural_ode_sciml.md
index dbffce5280..882d77d2bf 100644
--- a/docs/src/examples/neural_ode_sciml.md
+++ b/docs/src/examples/neural_ode_sciml.md
@@ -17,34 +17,40 @@ follow a full explanation of the definition and training process:
 
 ```julia
 using DiffEqFlux, DifferentialEquations, Plots, GalacticOptim
 
-u0 = Float32[2.0; 0.0]
-datasize = 30
-tspan = (0.0f0, 1.5f0)
-tsteps = range(tspan[1], tspan[2], length = datasize)
+u0 = Float32[2.0; 0.0] # Initial condition
+datasize = 30 # Number of data points
+tspan = (0.0f0, 1.5f0) # Time range
+tsteps = range(tspan[1], tspan[2], length = datasize) # Split time range into equal steps for each data point
 
+# Function that will generate the data we are trying to fit
 function trueODEfunc(du, u, p, t)
     true_A = [-0.1 2.0; -2.0 -0.1]
-    du .= ((u.^3)'true_A)'
+    du .= ((u.^3)'true_A)' # Need transposes to make the matrix multiplication work
 end
 
+# Define the problem with the function above
 prob_trueode = ODEProblem(trueODEfunc, u0, tspan)
+# Solve and take just the solution array
 ode_data = Array(solve(prob_trueode, Tsit5(), saveat = tsteps))
 
-dudt2 = FastChain((x, p) -> x.^3,
-                  FastDense(2, 50, tanh),
+# Make a neural net and wrap it in a NeuralODE layer
+dudt2 = FastChain((x, p) -> x.^3, # Guess a cubic function
+                  FastDense(2, 50, tanh), # Multilayer perceptron for the part we don't know
                   FastDense(50, 2))
 prob_neuralode = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps)
 
+# Array of predictions from the NeuralODE with parameters p, starting from initial condition u0
 function predict_neuralode(p)
   Array(prob_neuralode(u0, p))
 end
 
 function loss_neuralode(p)
     pred = predict_neuralode(p)
-    loss = sum(abs2, ode_data .- pred)
+    loss = sum(abs2, ode_data .- pred) # Sum of squared errors
     return loss, pred
 end
 
+# Callback function to observe training
 callback = function (p, l, pred; doplot = true)
   display(l)
   # plot current prediction against data
@@ -56,6 +62,7 @@ callback = function (p, l, pred; doplot = true)
   return false
 end
 
+# The initial parameters come from prob_neuralode.p
 result_neuralode = DiffEqFlux.sciml_train(loss_neuralode, prob_neuralode.p,
                                           cb = callback)
 ```
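
As a follow-up to the tutorial code touched by this patch, here is a minimal sketch of inspecting the trained model once `sciml_train` returns. It is an illustration rather than part of the patch: it assumes the returned result exposes the trained parameters through a `minimizer` field (the convention used for `sciml_train` results in the DiffEqFlux docs of this period) and reuses `predict_neuralode`, `ode_data`, and `tsteps` from the tutorial:

```julia
# Sketch: visualize the trained fit, assuming the tutorial definitions above are loaded
# and that `result_neuralode.minimizer` holds the trained parameter vector.
trained_pred = predict_neuralode(result_neuralode.minimizer)

# Compare the first state variable of the data against the trained prediction
plt = scatter(tsteps, ode_data[1, :], label = "data")
scatter!(plt, tsteps, trained_pred[1, :], label = "prediction")
display(plt)
```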