In [1]:
import Flux, Statistics, BSON, Zygote
import Parameters

## Gradient

In [2]:
# Cubic test polynomial used in the differentiation examples that follow.
function g(x)
    cubic = 5x^3
    quadratic = 3x^2
    linear = 2x
    return cubic + quadratic + linear
end

g (generic function with 1 method)

In [3]:
# Evaluate g at x = 1.0.
g(1.0)

10.0

In [10]:
# Differentiate g at x = 1; the result is a one-element tuple holding the derivative.
Zygote.gradient(g, 1)

(23.0,)

In [11]:
# First derivative of g at x = 1.0, written with the postfix ' operator.
g'(1.0)

23.0

In [12]:
# Second derivative of g at x = 1.0, obtained by applying ' twice.
g''(1.0)

36.0

In [14]:
# Random 3-element inputs; `cinp` is read by `func` from global scope.
ainp = rand(3)
binp = rand(3)
cinp = rand(3)

# Sum of squared element-wise differences between the global `cinp`
# and a.^2 + b.^2. Returns a scalar, so a plain gradient applies.
function func(a, b)
    residual = cinp .- (a .^ 2 .+ b .^ 2)
    return sum(residual .^ 2)
end

# Gradients with respect to both arguments, returned as a tuple.
Zygote.gradient(func, ainp, binp)

([0.2870794464187259, -0.15072324287419367, 0.2545307787495331], [2.4980934937311376, -0.0852769368471369, 1.825895586034233])

## Jacobian

In [15]:
# Matrix inputs; `cinp` (all ones) is read by `func` from global scope.
ainp = rand(3, 4)
binp = rand(4, 3)
cinp = ones(3, 3)

# Element-wise squared difference between `cinp` and the matrix product a * b.
# The result is a matrix rather than a scalar, hence the Jacobian call below.
function func(a, b)
    product = a * b
    return (cinp .- product) .^ 2
end

# One Jacobian per argument, returned as a tuple.
Zygote.jacobian(func, ainp, binp)

([-1.0573986293327713 0.0 … 0.0 0.0; 0.0 -0.15967164587968913 … -0.07338191150069401 0.0; … ; 0.0 -0.22889974206570568 … -0.1966533484846408 0.0; 0.0 0.0 … 0.0 -0.00013050951781236987], [-0.27720208189726325 -0.045438751583509136 … 0.0 0.0; -0.10620532917681685 -0.052254138639468796 … 0.0 0.0; … ; 0.0 0.0 … -0.10219294012069868 -0.08934439827810202; 0.0 0.0 … -5.3945807226079016e-5 -0.00010746416573991927])

## Machine Learning

Consider solving `y = w * x + b`, which is a standard gradient descent problem.

The loss function is the discrepancy between the estimated `y` and the true `y` (`yhat`).

`L = 0.5 * sum((yhat - y).^2)`

To solve it, we need to compute dL/dw.

In [16]:
import Random
Random.seed!(2022)  # fix the seed so the random draws below are reproducible

# Initial parameters and a single input sample
Wo = rand(4, 3)
bo = rand(4, 1)
x = rand(3)

# Target values the model output is compared against
yhat = rand(4)

# Half the sum of squared errors between the targets `yhat` and W * x .+ b.
# Reads `x` and `yhat` from global scope.
function loss(W, b)
    prediction = W * x .+ b
    residual = yhat - prediction
    return sum(0.5 * residual .^ 2)
end

# Gradients of the loss with respect to W and b, returned as a tuple.
Zygote.gradient(loss, Wo, bo)

([0.1495067559506688 1.434866431317314 1.042899862082106; 0.013571459568988413 0.13024984480264554 0.09466912195874969; 0.14021211045434054 1.3456626041800532 0.9780640996806981; 0.0020561482589988318 0.01973354378461961 0.014342875157011598], [1.5022101214306587; 0.13636296097447465; 1.408819622446778; 0.020659713376760314;;])

In [31]:
# Model output for the initial weights and bias.
Wo*x .+ bo

4×1 Matrix{Float64}:
 1.9145409176542763
 0.5376768790526782
 1.6628666613100092
 1.008819269756692

## Flux layers

In [18]:
# Dense layer mapping 3 inputs to 4 outputs.
linearlayer = Flux.Dense(3,4)

Dense(3 => 4)       [90m# 16 parameters[39m

In [19]:
# Apply the layer to a random 3-element input.
linearlayer(rand(3))

4-element Vector{Float64}:
 -0.172069408331454
  0.4817180636593049
  0.6488648605809555
 -0.08067461133091389

In [20]:
# Collect the layer's trainable arrays.
weights = Flux.params(linearlayer)

Params([Float32[-0.044647608 -0.31147394 0.3310886; -0.46983743 0.32885465 0.90849006; 0.16546322 0.552223 0.17763619; 0.17444198 0.09530168 -0.59155774], Float32[0.0, 0.0, 0.0, 0.0]])

In [35]:
# First entry: the 4×3 weight matrix.
weights[1]

4×3 Matrix{Float32}:
 0.426271  -0.41051     0.165463
 0.375355  -0.467059    0.174442
 0.82343   -0.0446476  -0.311474
 0.225686  -0.469837    0.328855

In [21]:
# Second entry: a 4-element vector (presumably the layer's bias).
weights[2]

4-element Vector{Float32}:
 0.0
 0.0
 0.0
 0.0

In [28]:
# Sum of squared errors between the targets `yhat` and the layer output.
# Reads `linearlayer` and `yhat` from global scope.
function loss(x)
    prediction = linearlayer(x)
    residual = yhat .- prediction
    return sum(residual .^ 2)
end

l = loss(x)
println("Loss is: ", l)

# Gradient of the loss with respect to every array in the layer's params.
gs = Zygote.gradient(Flux.params(linearlayer)) do
    loss(x)
end

Loss is: 2.317448061468025


Grads(...)

In [29]:
# Inspect the layer parameters again after the gradient computation.
Flux.params(linearlayer)

Params([Float32[-0.044647608 -0.31147394 0.3310886; -0.46983743 0.32885465 0.90849006; 0.16546322 0.552223 0.17763619; 0.17444198 0.09530168 -0.59155774], Float32[0.0, 0.0, 0.0, 0.0]])

In [33]:
opt = Flux.ADAM(0.1)

# Update one parameter array at a time using the gradients stored in `gs`,
# showing the array and the loss before and after each update.
for param in Flux.params(linearlayer)
    println("before:")
    display(param)
    println("loss before: ", loss(x))

    Flux.Optimise.update!(opt, param, gs[param])

    println("after:")
    display(param)
    println("loss after: ", loss(x))
end

before:


4×3 Matrix{Float32}:
  0.156352   -0.110474   0.532089
 -0.670837    0.127855   0.70749
 -0.0355368   0.351223  -0.0233638
  0.375442    0.296302  -0.390558

loss before: 0.5716848188579376
after:


4×3 Matrix{Float32}:
  0.256352  -0.010474    0.632089
 -0.770837   0.0278547   0.60749
 -0.135537   0.251223   -0.123364
  0.475442   0.396302   -0.290558

loss after: 0.5279925325805124
before:


4-element Vector{Float32}:
  0.20099999
 -0.20099999
 -0.20099999
  0.20099999

loss before: 0.5279925325805124
after:


4-element Vector{Float32}:
  0.301
 -0.301
 -0.301
  0.301

loss after: 0.6129679051933241


: 

In [65]:
# Plain gradient descent with learning rate 0.1
opt = Flux.Descent(0.1)

# Update one parameter array at a time using the gradients stored in `gs`,
# printing the array and the loss before and after each update.
for param in Flux.params(linearlayer)
    println("before: ", param)
    println("loss before: ", loss(x))

    Flux.Optimise.update!(opt, param, gs[param])

    println("after: ", param)
    println("loss after: ", loss(x))
end

before: Float32[0.4391527 -0.28688365 0.25531796; 0.38906926 -0.33544204 0.27010497; 0.83200884 0.037684325 -0.25163284; 0.24929641 -0.243239 0.49355254]
loss before: 0.6700865544827526
after: Float32[0.44584006 -0.22270271 0.3019664; 0.3961889 -0.2671124 0.31976882; 0.83646244 0.08042728 -0.2205661; 0.26155394 -0.1255995 0.57905614]
loss after: 0.3465570819064103
before: Float32[0.12942824, 0.13779455, 0.08619608, 0.23723355]
loss before: 0.3465570819064103
after: Float32[0.19662143, 0.20933115, 0.13094512, 0.3603943]
loss after: 0.180602370202887


## Trying backprop thru arbitrary layers
## 1. FFT

In [None]:
using Random
using FFTW
Random.seed!(1234)

# Problem definition
x = rand(3);
# Training data for a loss definition (not used by the loss below)
yhat = rand(4);

# Flux and Zygote are loaded with `import`, so their names must be qualified;
# the bare `Dense`, `Chain`, `σ`, `gradient` and `params` are not in scope.
linearlayer = Flux.Dense(3, 4, Flux.σ);
model = Flux.Chain(linearlayer, x -> real(fft(x)));
#model = Flux.Chain(linearlayer, x -> imag(fft(x)));
out = model(x)
println(out)

# Backprop through the FFT layer: sum of squares of the model output.
function loss(x)
    y = model(x)
    return sum(y .^ 2)
end

l = loss(x)
println("loss is: ", l)
gs = Zygote.gradient(() -> loss(x), Flux.params(model))

## 2. SVD

In [None]:
using Random
using LinearAlgebra
Random.seed!(1234)

# Problem definition
x = rand(3);
xmat = Float32.(rand(10, 10, 1, 1));  # conv input (w, l, nch, batch)
yhatmat = Float32.(rand(10, 10, 1, 1));
# Training data for a loss definition (not used by the loss below)
yhat = rand(4);

# Flux and Zygote are loaded with `import`, so their names must be qualified;
# the bare `Dense`, `Conv`, `relu`, `Chain`, `gradient` and `params` are not in scope.
linearlayer = Flux.Dense(3, 4, Flux.σ);
convlayer = Flux.Conv((3, 3), 1 => 1, Flux.relu);
#model = Flux.Chain(linearlayer, x -> svd(x));
#model = Flux.Chain(convlayer);
model = Flux.Chain(convlayer,
            x -> dropdims(x, dims=4),   # drop the batch dimension
            x -> dropdims(x, dims=3),   # drop the channel dimension: svd needs a matrix
            x -> svd(x).S,              # keep only the singular values
            x -> x .^ 3 .+ 2 .* x);
out = model(xmat);
@show size(out)

# Backprop through the SVD layer: sum of squares of the model output.
function loss(x)
    y = model(x)
    return sum(y .^ 2)
end

l = loss(xmat)
#println("loss is: ", l)
gs = Zygote.gradient(() -> loss(xmat), Flux.params(model))

So adding SVD works, although we have to drop dimensions and then add them back
on every iteration for it to work (since SVD only accepts an m x n matrix, with no channel or batch dimensions).

## 3. NNs with Arbitrary equations embedded
Let's consider an input matrix X.
Define the output Y, which satisfies `Y = X' + 2X.^2`.
Problem: we have Y, but need to learn X such that the equation is satisfied.

This works, as amply demonstrated in documentation and my own experiments. Just find a good application problem to use it with.

In [None]:
using Random
using LinearAlgebra
Random.seed!(1234)

# Problem definition
x = rand(3);
dx = 0.1;  # grid spacing read by `fdmlayer` from global scope

xmat = Float32.(rand(10, 10, 1, 1));  # conv input (w, l, nch, batch)
yhatmat = Float32.(rand(10, 10, 1, 1));
# Training data for a loss definition (not used by the loss below)
yhat = rand(4);

# Flux and Zygote are loaded with `import`, so their names must be qualified;
# the bare `Dense`, `Conv`, `relu`, `Chain`, `gradient` and `params` are not in scope.
linearlayer = Flux.Dense(3, 4, Flux.σ);
convlayer = Flux.Conv((3, 3), 1 => 1, Flux.relu; pad=1);

# Finite-difference layer: differences of neighbouring rows and of neighbouring
# columns of `x`, each scaled by 1 / (2 * dx), combined as a product of transposes.
# The product only has matching dimensions when `x` is square.
# NOTE(review): neighbouring samples are differenced but the denominator is 2 * dx;
# confirm whether a central difference (or a plain dx) was intended.
function fdmlayer(x)
    fdmgradx = (x[2:end, :] - x[1:end-1, :]) / (2 * dx)
    fdmgrady = (x[:, 2:end] - x[:, 1:end-1]) / (2 * dx)
    gradfull = fdmgradx' * fdmgrady'
    #res = gradfull + 2 * x.^2
    return gradfull
end

model = Flux.Chain(convlayer,
            x -> dropdims(x, dims=4),   # drop the batch dimension
            x -> dropdims(x, dims=3),   # drop the channel dimension
            #x -> fdmlayer(x),
            x -> x .^ 3 .+ 2 .* x);
out = model(xmat);
@show size(out)

# Backprop through the model: sum of squares of the model output.
function loss(x)
    y = model(x)
    return sum(y .^ 2)
end

l = loss(xmat)
#println("loss is: ", l)
gs = Zygote.gradient(() -> loss(xmat), Flux.params(model))

In [None]:
# Differences of neighbouring rows / columns of `xmat`, scaled by 1 / (2dx).
fdmgradx = (xmat[2:end, :] .- xmat[1:(end - 1), :]) ./ (2dx);
fdmgrady = (xmat[:, 2:end] .- xmat[:, 1:(end - 1)]) ./ (2dx);

In [None]:
# Product of the two transposed difference matrices, then its shape.
out = adjoint(fdmgradx) * adjoint(fdmgrady)
size(out)

In [None]:
# Element-wise cubic-plus-linear transform of `out`.
out2 = out .^ 3 .+ 2 .* out;

In [None]:
# Sum of squares of `out2`.
sum(abs2, out2)