Deprecate FastChain and sciml_train for v2.0 #794

Merged Jan 20, 2023 (36 commits)

Commits
7c1ce87  Deprecate FastChain and sciml_train for v2.0 (ChrisRackauckas, Jan 17, 2023)
f979816  Update test/multiple_shoot.jl (ChrisRackauckas, Jan 17, 2023)
f6bf0d2  Update test/neural_dae.jl (ChrisRackauckas, Jan 17, 2023)
894c759  Lux (ChrisRackauckas, Jan 17, 2023)
ce1d4a0  bring back the reexport (ChrisRackauckas, Jan 17, 2023)
8f5adea  Update Project.toml (ChrisRackauckas, Jan 17, 2023)
f2a9834  remove optimization compat (ChrisRackauckas, Jan 17, 2023)
f71c7fb  Merge remote-tracking branch 'origin/depremoval2' into depremoval2 (ChrisRackauckas, Jan 17, 2023)
22cf368  a few fixes (ChrisRackauckas, Jan 17, 2023)
eaf8efa  more fixes (ChrisRackauckas, Jan 17, 2023)
4b1c6d4  add using Zygotes (ChrisRackauckas, Jan 18, 2023)
8f3b52f  fix loss signature (ChrisRackauckas, Jan 18, 2023)
efbd9b9  fix a few usings (ChrisRackauckas, Jan 18, 2023)
9f6b63c  fix some namespacing (ChrisRackauckas, Jan 18, 2023)
2b6c9df  fix activation definition (ChrisRackauckas, Jan 18, 2023)
1a7c4d3  use a componentarray (ChrisRackauckas, Jan 18, 2023)
89088cb  typo (ChrisRackauckas, Jan 18, 2023)
bc0f86d  add the optimizers (ChrisRackauckas, Jan 18, 2023)
441c90e  fix multiple shoot tests (ChrisRackauckas, Jan 18, 2023)
ecb1dbb  Flux it (ChrisRackauckas, Jan 18, 2023)
f492645  typo (ChrisRackauckas, Jan 18, 2023)
dfbd38c  mark Flux GPU (ChrisRackauckas, Jan 18, 2023)
e019eb7  Fix doc build (ChrisRackauckas, Jan 18, 2023)
6e002f4  Update test/mnist_gpu.jl (ChrisRackauckas, Jan 19, 2023)
11015c7  Update mnist_gpu.jl (Abhishek-1Bhatt, Jan 19, 2023)
27317e1  Merge pull request #796 from Abhishek-1Bhatt/patch-1 (ChrisRackauckas, Jan 19, 2023)
24a8005  Flux.cpu (Abhishek-1Bhatt, Jan 19, 2023)
6faea34  Update mnist_conv_gpu.jl (Abhishek-1Bhatt, Jan 19, 2023)
0242cb2  Flux.Conv (Abhishek-1Bhatt, Jan 19, 2023)
537507f  Merge pull request #797 from Abhishek-1Bhatt/patch-1 (ChrisRackauckas, Jan 19, 2023)
184a182  Specify Flux in all layers (Abhishek-1Bhatt, Jan 20, 2023)
6147fbf  Update mnist_conv_neural_ode.md (Abhishek-1Bhatt, Jan 20, 2023)
ebd3b47  Update mnist_neural_ode.md (Abhishek-1Bhatt, Jan 20, 2023)
04c0886  Merge pull request #799 from Abhishek-1Bhatt/patch-1 (ChrisRackauckas, Jan 20, 2023)
3555634  fix Flux designations in docs (ChrisRackauckas, Jan 20, 2023)
035d19e  try this (ChrisRackauckas, Jan 20, 2023)
21 changes: 1 addition & 20 deletions Project.toml
@@ -1,16 +1,14 @@
name = "DiffEqFlux"
uuid = "aae7a2af-3d4f-5e19-a356-7da93b79d9d0"
authors = ["Chris Rackauckas <accounts@chrisrackauckas.com>"]
version = "1.54.0"
version = "2.0.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Cassette = "7057c7e9-c182-5462-911a-8362d720325c"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
ConsoleProgressMonitor = "88cd18e8-d9cc-4ea6-8889-5259c0d15c8b"
DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0"
DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
DistributionsAD = "ced4e74d-a319-5a8a-b0ac-84af2272839c"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
@@ -20,12 +18,6 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
OptimizationFlux = "253f991c-a7b2-45f8-8852-8b9a9df78a86"
OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
OptimizationPolyalgorithms = "500b13db-7e66-49ce-bda4-eed966be6282"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
ProgressLogging = "33c8b6b6-d38a-422a-b730-caa89a2f386c"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -34,39 +26,28 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
TerminalLoggers = "5d786b92-1e48-4d6f-9151-6b4477ca9bed"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444"

[compat]
Adapt = "3"
Cassette = "0.3.7"
ChainRulesCore = "1"
ConsoleProgressMonitor = "0.1"
DataInterpolations = "3.3"
DiffEqBase = "6.41"
DiffResults = "1.0"
Distributions = "0.23, 0.24, 0.25"
DistributionsAD = "0.6"
Flux = "0.12, 0.13"
ForwardDiff = "0.10"
Functors = "0.4"
LoggingExtras = "0.4, 1"
Lux = "0.4"
NNlib = "0.7, 0.8"
Optim = "1"
Optimization = "3"
OptimizationFlux = "0.1"
OptimizationOptimJL = "0.1"
OptimizationPolyalgorithms = "0.1"
ProgressLogging = "0.1"
RecursiveArrayTools = "2"
Reexport = "0.2, 1"
Requires = "0.5, 1.0"
SciMLBase = "1"
SciMLSensitivity = "7"
StaticArrays = "0.11, 0.12, 1"
TerminalLoggers = "0.1"
Zygote = "0.5, 0.6"
ZygoteRules = "0.2"
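The upshot of this file: the version bumps to 2.0.0, and the Optim/Optimization-related dependencies drop out, since v2 deprecates `sciml_train` in favor of driving Optimization.jl directly (and `FastChain` in favor of Lux). A minimal sketch of the v2-style workflow, illustrative only and not part of this diff (APIs assumed from standard Lux/Optimization.jl usage of the time):

```julia
using Lux, Optimization, OptimizationOptimisers, ComponentArrays, Random, Zygote

# `FastChain`/`FastDense` become an explicit Lux model with external parameters
model = Lux.Chain(Lux.Dense(2, 16, tanh), Lux.Dense(16, 2))
ps, st = Lux.setup(Random.default_rng(), model)
ps = ComponentArray(ps)  # flat, differentiable parameter vector

# `sciml_train(loss, p, opt)` becomes an explicit OptimizationProblem + solve
loss(p, _) = sum(abs2, first(model(ones(Float32, 2), p, st)))
optfun = OptimizationFunction(loss, Optimization.AutoZygote())
result = solve(OptimizationProblem(optfun, ps), Adam(0.05); maxiters = 200)
```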
2 changes: 1 addition & 1 deletion docs/Project.toml
@@ -22,7 +22,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StochasticDiffEq = "789caeaf-c7a9-5a7d-9973-96adeb23e2a0"

[compat]
DiffEqFlux = "1.52.0"
DiffEqFlux = "2"
DifferentialEquations = "7.6.0"
Distances = "0.10.7"
Distributions = "0.25.78"
40 changes: 20 additions & 20 deletions docs/src/examples/augmented_neural_ode.md
@@ -28,24 +28,24 @@ function concentric_sphere(dim, inner_radius_range, outer_radius_range,
end
data = cat(data..., dims=2)
labels = cat(labels..., dims=2)
DataLoader((data |> gpu, labels |> gpu); batchsize=batch_size, shuffle=true,
DataLoader((data |> Flux.gpu, labels |> Flux.gpu); batchsize=batch_size, shuffle=true,
partial=false)
end

diffeqarray_to_array(x) = reshape(gpu(x), size(x)[1:2])
diffeqarray_to_array(x) = reshape(Flux.gpu(x), size(x)[1:2])

function construct_model(out_dim, input_dim, hidden_dim, augment_dim)
input_dim = input_dim + augment_dim
node = NeuralODE(Chain(Dense(input_dim, hidden_dim, relu),
Dense(hidden_dim, hidden_dim, relu),
Dense(hidden_dim, input_dim)) |> gpu,
node = NeuralODE(Flux.Chain(Flux.Dense(input_dim, hidden_dim, relu),
Flux.Dense(hidden_dim, hidden_dim, relu),
Flux.Dense(hidden_dim, input_dim)) |> Flux.gpu,
(0.f0, 1.f0), Tsit5(), save_everystep = false,
reltol = 1e-3, abstol = 1e-3, save_start = false) |> gpu
reltol = 1e-3, abstol = 1e-3, save_start = false) |> Flux.gpu
node = augment_dim == 0 ? node : AugmentedNDELayer(node, augment_dim)
return Chain((x, p=node.p) -> node(x, p),
return Flux.Chain((x, p=node.p) -> node(x, p),
Array,
diffeqarray_to_array,
Dense(input_dim, out_dim) |> gpu), node.p |> gpu
Flux.Dense(input_dim, out_dim) |> Flux.gpu), node.p |> Flux.gpu
end

function plot_contour(model, npoints = 300)
@@ -57,7 +57,7 @@ function plot_contour(model, npoints = 300)
grid_points[:, idx] .= [x1, x2]
idx += 1
end
sol = reshape(model(grid_points |> gpu), npoints, npoints) |> cpu
sol = reshape(model(grid_points |> Flux.gpu), npoints, npoints) |> Flux.cpu

return contour(x, y, sol, fill = true, linewidth=0.0)
end
@@ -146,7 +146,7 @@ function concentric_sphere(dim, inner_radius_range, outer_radius_range,
end
data = cat(data..., dims=2)
labels = cat(labels..., dims=2)
return DataLoader((data |> gpu, labels |> gpu); batchsize=batch_size, shuffle=true,
return DataLoader((data |> Flux.gpu, labels |> Flux.gpu); batchsize=batch_size, shuffle=true,
partial=false)
end
```
@@ -160,24 +160,24 @@ DE Layer by appending zeros. So in order to use any arbitrary DE Layer in combin
simply assume that the input to the DE Layer is of size `size(x, 1) + augment_dim` instead of `size(x, 1)`
and construct that layer accordingly.
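Concretely, the augmentation is nothing more than zero-padding. A toy sketch of the shape bookkeeping (illustrative only; `augment` here is a hypothetical helper, not the library call):

```julia
# Pad each input column with `augment_dim` zeros, so a DE layer wrapped this
# way must be constructed for `size(x, 1) + augment_dim` input dimensions.
augment(x, augment_dim) = vcat(x, zeros(eltype(x), augment_dim, size(x, 2)))

x = rand(Float32, 2, 16)   # 2 features, batch of 16
size(augment(x, 1))        # (3, 16): construct the DE layer for 3 input dims
```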

In order to run the models on GPU, we need to manually transfer the models to GPU. First one is the network
In order to run the models on the GPU, we need to manually transfer the models to the GPU with `Flux.gpu`. The first one is the network
predicting the derivatives inside the Neural ODE and the other one is the last layer in the Chain.

```@example augneuralode
diffeqarray_to_array(x) = reshape(gpu(x), size(x)[1:2])
diffeqarray_to_array(x) = reshape(Flux.gpu(x), size(x)[1:2])

function construct_model(out_dim, input_dim, hidden_dim, augment_dim)
input_dim = input_dim + augment_dim
node = NeuralODE(Chain(Dense(input_dim, hidden_dim, relu),
Dense(hidden_dim, hidden_dim, relu),
Dense(hidden_dim, input_dim)) |> gpu,
node = NeuralODE(Flux.Chain(Flux.Dense(input_dim, hidden_dim, relu),
Flux.Dense(hidden_dim, hidden_dim, relu),
Flux.Dense(hidden_dim, input_dim)) |> Flux.gpu,
(0.f0, 1.f0), Tsit5(), save_everystep = false,
reltol = 1e-3, abstol = 1e-3, save_start = false) |> gpu
node = augment_dim == 0 ? node : (AugmentedNDELayer(node, augment_dim) |> gpu)
return Chain((x, p=node.p) -> node(x, p),
reltol = 1e-3, abstol = 1e-3, save_start = false) |> Flux.gpu
node = augment_dim == 0 ? node : (AugmentedNDELayer(node, augment_dim) |> Flux.gpu)
return Flux.Chain((x, p=node.p) -> node(x, p),
Array,
diffeqarray_to_array,
Dense(input_dim, out_dim) |> gpu), node.p |> gpu
Flux.Dense(input_dim, out_dim) |> Flux.gpu), node.p |> Flux.gpu
end
```

@@ -195,7 +195,7 @@ function plot_contour(model, npoints = 300)
grid_points[:, idx] .= [x1, x2]
idx += 1
end
sol = reshape(model(grid_points |> gpu), npoints, npoints) |> cpu
sol = reshape(model(grid_points |> Flux.gpu), npoints, npoints) |> Flux.cpu

return contour(x, y, sol, fill = true, linewidth=0.0)
end
4 changes: 2 additions & 2 deletions docs/src/examples/hamiltonian_nn.md
@@ -23,7 +23,7 @@ target = cat(dqdt, dpdt, dims = 1)
dataloader = Flux.Data.DataLoader((data, target); batchsize=256, shuffle=true)

hnn = HamiltonianNN(
Chain(Dense(2, 64, relu), Dense(64, 1))
Flux.Chain(Flux.Dense(2, 64, relu), Flux.Dense(64, 1))
)

p = hnn.p
Expand Down Expand Up @@ -86,7 +86,7 @@ We parameterize the HamiltonianNN with a small MultiLayered Perceptron (HNN also

```@example hamiltonian
hnn = HamiltonianNN(
Chain(Dense(2, 64, relu), Dense(64, 1))
Flux.Chain(Flux.Dense(2, 64, relu), Flux.Dense(64, 1))
)

p = hnn.p
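# A minimal training sketch (assuming `mean` from Statistics and the
# `hnn(x, p)` call signature used in the full example): fit the HNN so its
# predicted symplectic gradient matches the target derivatives built above.
loss(x, y, p) = mean((hnn(x, p) .- y) .^ 2)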
38 changes: 19 additions & 19 deletions docs/src/examples/mnist_conv_neural_ode.md
@@ -28,10 +28,10 @@ function loadmnist(batchsize = bs, train_split = 0.9)
p = train_split)
return (
# Use Flux's DataLoader to automatically minibatch and shuffle the data
DataLoader(gpu.(collect.((x_train, y_train))); batchsize = batchsize,
DataLoader(Flux.gpu.(collect.((x_train, y_train))); batchsize = batchsize,
shuffle = true),
# Don't shuffle the test data
DataLoader(gpu.(collect.((x_test, y_test))); batchsize = batchsize,
DataLoader(Flux.gpu.(collect.((x_test, y_test))); batchsize = batchsize,
shuffle = false)
)
end
@@ -43,21 +43,21 @@ train_dataloader, test_dataloader = loadmnist(bs, train_split)

down = Flux.Chain(Flux.Conv((3, 3), 1=>64, relu, stride = 1), Flux.GroupNorm(64, 64),
Flux.Conv((4, 4), 64=>64, relu, stride = 2, pad=1), Flux.GroupNorm(64, 64),
Flux.Conv((4, 4), 64=>64, stride = 2, pad = 1)) |>gpu
Flux.Conv((4, 4), 64=>64, stride = 2, pad = 1)) |> Flux.gpu

dudt = Flux.Chain(Flux.Conv((3, 3), 64=>64, tanh, stride=1, pad=1),
Flux.Conv((3, 3), 64=>64, tanh, stride=1, pad=1)) |>gpu
Flux.Conv((3, 3), 64=>64, tanh, stride=1, pad=1)) |> Flux.gpu

fc = Flux.Chain(Flux.GroupNorm(64, 64), x -> relu.(x), Flux.MeanPool((6, 6)),
x -> reshape(x, (64, :)), Flux.Dense(64,10)) |> gpu
x -> reshape(x, (64, :)), Flux.Dense(64,10)) |> Flux.gpu

nn_ode = NeuralODE(dudt, (0.f0, 1.f0), Tsit5(),
save_everystep = false,
reltol = 1e-3, abstol = 1e-3,
save_start = false) |> gpu
save_start = false) |> Flux.gpu

function DiffEqArray_to_Array(x)
xarr = gpu(x)
xarr = Flux.gpu(x)
return xarr[:,:,:,:,1]
end

@@ -84,8 +84,8 @@ function accuracy(model, data; n_batches = 100)
for (i, (x, y)) in enumerate(data)
# Only evaluate accuracy for n_batches
i > n_batches && break
target_class = classify(cpu(y))
predicted_class = classify(cpu(model(x)))
target_class = classify(Flux.cpu(y))
predicted_class = classify(Flux.cpu(model(x)))
total_correct += sum(target_class .== predicted_class)
total += length(target_class)
end
@@ -174,10 +174,10 @@ function loadmnist(batchsize = bs, train_split = 0.9)
p = train_split)
return (
# Use Flux's DataLoader to automatically minibatch and shuffle the data
DataLoader(gpu.(collect.((x_train, y_train))); batchsize = batchsize,
DataLoader(Flux.gpu.(collect.((x_train, y_train))); batchsize = batchsize,
shuffle = true),
# Don't shuffle the test data
DataLoader(gpu.(collect.((x_test, y_test))); batchsize = batchsize,
DataLoader(Flux.gpu.(collect.((x_test, y_test))); batchsize = batchsize,
shuffle = false)
)
end
@@ -202,18 +202,18 @@ to the next. Four different sets of layers are used here:
```julia
down = Flux.Chain(Flux.Conv((3, 3), 1=>64, relu, stride = 1), Flux.GroupNorm(64, 64),
Flux.Conv((4, 4), 64=>64, relu, stride = 2, pad=1), Flux.GroupNorm(64, 64),
Flux.Conv((4, 4), 64=>64, stride = 2, pad = 1)) |>gpu
Flux.Conv((4, 4), 64=>64, stride = 2, pad = 1)) |> Flux.gpu

dudt = Flux.Chain(Flux.Conv((3, 3), 64=>64, tanh, stride=1, pad=1),
Flux.Conv((3, 3), 64=>64, tanh, stride=1, pad=1)) |>gpu
Flux.Conv((3, 3), 64=>64, tanh, stride=1, pad=1)) |> Flux.gpu

fc = Flux.Chain(Flux.GroupNorm(64, 64), x -> relu.(x), Flux.MeanPool((6, 6)),
x -> reshape(x, (64, :)), Flux.Dense(64,10)) |> gpu
x -> reshape(x, (64, :)), Flux.Dense(64,10)) |> Flux.gpu

nn_ode = NeuralODE(dudt, (0.f0, 1.f0), Tsit5(),
save_everystep = false,
reltol = 1e-3, abstol = 1e-3,
save_start = false) |> gpu
save_start = false) |> Flux.gpu
```

`down`: This layer downsamples our images into `6 x 6 x 64` dimensional features.
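A quick sanity check of that shape claim (a hedged sketch, assuming 28×28 MNIST inputs in Flux's WHCN layout):

```julia
x = rand(Float32, 28, 28, 1, 1) |> Flux.gpu  # one 28×28 grayscale image
size(down(x))                                # expect (6, 6, 64, 1)
```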
@@ -237,7 +237,7 @@ from the ODE solver into a Matrix that can be used in the following layer:

```julia
function DiffEqArray_to_Array(x)
xarr = gpu(x)
xarr = Flux.gpu(x)
return xarr[:,:,:,:,1]
end
```
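Since `save_everystep = false` and `save_start = false` leave a single saved time point, the solution carries a trailing singleton time axis, which the `[:,:,:,:,1]` indexing strips. A small illustration with hypothetical sizes:

```julia
xarr = rand(Float32, 6, 6, 64, 128, 1)  # H × W × C × batch × time
size(xarr[:, :, :, :, 1])               # (6, 6, 64, 128), ready for Flux layers
```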
@@ -275,7 +275,7 @@ This can also be built without the NN-ODE by replacing `nn-ode` with a simple `n

```julia
# We can also build the model topology without a NN-ODE
m_no_ode = Flux.Chain(down, nn, fc) |> gpu
m_no_ode = Flux.Chain(down, nn, fc) |> Flux.gpu

x_m = m_no_ode(img)
```
@@ -300,8 +300,8 @@ function accuracy(model, data; n_batches = 100)
for (i, (x, y)) in enumerate(data)
# Only evaluate accuracy for n_batches
i > n_batches && break
target_class = classify(cpu(y))
predicted_class = classify(cpu(model(x)))
target_class = classify(Flux.cpu(y))
predicted_class = classify(Flux.cpu(model(x)))
total_correct += sum(target_class .== predicted_class)
total += length(target_class)
end
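# For context, `classify` is defined earlier in the full example (assumed
# definition, not shown in this diff): it returns the most likely class
# index for each column of the network output, e.g.
# classify(x) = argmax.(eachcol(x))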