# First and second derivative of FNN with respect to input (tensorflow)

Import necessary packages.

In [36]:
import tensorflow as tf
import numpy as np

Define activation function and its derivatives.

In [37]:
# Custom activation function
# from keras.layers import Activation
# from keras import backend as K
# from keras.utils.generic_utils import get_custom_objects

#def mσ(x):
    #return np.abs(x) + np.log(1. + np.exp(-2. * np.abs(x)))
    
def mσ(x):
    return tf.math.divide(1, 1 + tf.math.exp(tf.math.negative(x)))

# get_custom_objects().update({'custom_activation': Activation(mσ)})


In [38]:
#def mdσ(x):
    #return np.tanh(x)
    
    
#def md2σ(x):
    #return np.divide(1., np.square(np.cosh(x)))

def mdσ(x):
    return mσ(x) * (1 - mσ(x))
    
    
def md2σ(x):
    return mσ(x) * (1 - mσ(x)) * (1 - 2*mσ(x))

In [39]:
x = [[10.], [20.], [30.]]

print(mσ(x))
print(tf.keras.activations.sigmoid(x))
print(mdσ(x))
print(md2σ(x))

tf.Tensor(
[[0.9999546]
 [1.       ]
 [1.       ]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[0.9999546]
 [1.       ]
 [1.       ]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[4.5416677e-05]
 [0.0000000e+00]
 [0.0000000e+00]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[-4.541255e-05]
 [-0.000000e+00]
 [-0.000000e+00]], shape=(3, 1), dtype=float32)


Does not exactly match the results/values in Julia.

Define Neural Network.

In [40]:
# Define model architecture
class PINN(tf.keras.Model):
    """ Set basic architecture of the PINN model."""

    def __init__(self,
                 output_dim=1,
                 num_hidden_layers=3,
                 num_neurons_per_layer=20,
                 activationfunction = 'sigmoid',
                 kernel_initializer='glorot_normal',
                 **kwargs):
        
        super().__init__(**kwargs)

        self.num_hidden_layers = num_hidden_layers
        self.input_dim = 2
        self.output_dim = output_dim

        # Define NN architecture
        
        # Inititialize num_hidden_layers many fully connected dense layers
        self.hidden = [tf.keras.layers.Dense(num_neurons_per_layer,
                                             activation = activationfunction,
                                             kernel_initializer=kernel_initializer) for _ in range(self.num_hidden_layers)]
        
        # Output layer
        #self.out = tf.keras.layers.Dense(output_dim, activation=None)
        self.out = tf.keras.layers.Dense(output_dim, activation = activationfunction)
        
    def call(self, X):
        """Forward-pass through neural network."""
        self.tmp_layer_output = [X]
        #Z = self.scale(X)
        Z = X
        
        for i in range(self.num_hidden_layers):
            Z = self.hidden[i](Z)
            self.tmp_layer_output.append(Z)
            
        return self.out(Z)

Compute gradient.

Compute gradient for layer l.

In [41]:
def get_gradient_layer(W,b,a,δ):
#     z1 = tf.transpose(a @ W + b)  
#     b = tf.reshape(b, z1.shape)
#     z2 = z1 + b
#     z3 = mdσ(z1) * δ
    return W @ (mdσ(tf.transpose(a @ W + b)) * δ)

Compute gradient of neural network.

In [115]:
def get_gradient(N, x):
    output = N(x)
    δ = get_gradient_layer(N.out.weights[0], N.out.weights[1], N.tmp_layer_output[-1], 1.)

    for k in range(N.num_hidden_layers-1, -1, -1):
        δ = get_gradient_layer(N.hidden[k].weights[0], N.hidden[k].weights[1], N.tmp_layer_output[k], δ)
            
    return output, δ

Compute gradient and Hessian of last layer.

In [116]:
def get_gradient_hessian_last_layer(W,b,a,δ):
#     z1 = tf.transpose(a @ W)  
#     b = tf.reshape(b, z1.shape)
#     z2 = z1 + b
#     z3 = mdσ(z2) * δ
    
#     ϑ = tf.linalg.diag(tf.reshape(md2σ(z2), [-1]))
    z = tf.transpose(a @ W + b)
    return W @ (mdσ(z) * δ), W @ (md2σ(z) * tf.transpose(W))

Compute gradient and Hessian of hidden layer.

In [117]:
def get_gradient_hessian_hidden_layer(W,b,a,δ,ϑ):
#     z1 = tf.transpose(a @ W)  
#     b = tf.reshape(b, np.shape(z1))
#     z2 = z1 + b
#     z3 = mdσ(z2) * δ
    
#     t2 = δ * md2σ(z2)
#     H1 = W @ tf.linalg.diag(tf.reshape(t2, [-1])) @ tf.transpose(W)

#     dσt = tf.linalg.diag(tf.reshape(mdσ(z2), [-1]))
#     H2 = W @ dσt @ ϑ @ dσt @ tf.transpose(W)
    z = tf.transpose(a @ W + b)
    dσt = mdσ(z) * tf.transpose(W)
    return W @ (mdσ(z) * δ), W @ ((δ * md2σ(z)) * tf.transpose(W)) + tf.transpose(dσt) @ ϑ @ dσt 

Compute Hessian and gradient of neural network.

In [118]:
def get_hessian(N, x):
    output = N(x)
    δ,ϑ = get_gradient_hessian_last_layer(N.out.weights[0], N.out.weights[1], N.tmp_layer_output[-1], 1.)

    for k in range(N.num_hidden_layers-1, -1, -1):
        δ,ϑ = get_gradient_hessian_hidden_layer(N.hidden[k].weights[0], N.hidden[k].weights[1], N.tmp_layer_output[k], δ,  ϑ)
      
    
    return output, δ, ϑ

In [119]:
NeuralN = PINN()

x = tf.random.normal((3,2))

out2, δ2, ϑ = get_hessian(NeuralN,x)

InvalidArgumentError: Incompatible shapes: [1,3] vs. [1,20] [Op:Mul]

In [120]:
x = tf.random.normal((9,2))
NeuralN(x)
A = tf.random.normal((1,20))
len(NeuralN.tmp_layer_output)
#print(NeuralN.tmp_layer_output[3])
z = NeuralN.tmp_layer_output[3] @ NeuralN.out.weights[0] + NeuralN.out.weights[1]
print(z)
print(md2σ(z))

tf.Tensor(
[[0.9218569 ]
 [0.92649245]
 [0.9226983 ]
 [0.9151223 ]
 [0.9298943 ]
 [0.9309273 ]
 [0.9274326 ]
 [0.9275983 ]
 [0.9266088 ]], shape=(9, 1), dtype=float32)
tf.Tensor(
[[-0.0877166 ]
 [-0.08792435]
 [-0.0877545 ]
 [-0.08740994]
 [-0.08807508]
 [-0.08812055]
 [-0.08796614]
 [-0.08797351]
 [-0.08792952]], shape=(9, 1), dtype=float32)


Why do we get a 2D vector when we insert a 2D vector?

In [121]:
NeuralN = PINN()

x = tf.random.normal((1,2))

#out = NeuralN(x)
#print(out)

out1, δ1 = get_gradient(NeuralN, x)
out2, δ2, ϑ = get_hessian(NeuralN,x)

print(δ1- δ2)
print(δ1)
#print(tf.reshape(ϑ, [-1]))

tf.Tensor(
[[0.]
 [0.]], shape=(2, 1), dtype=float32)
tf.Tensor(
[[-0.00025731]
 [ 0.00012754]], shape=(2, 1), dtype=float32)


In [122]:
x = tf.random.normal((20,2))
out1, δ1 = get_gradient(NeuralN, x)
print(δ1)
print(out1)
# δ_ad = _fvals1(NeuralN, x)[1]
# print(δ_ad[0,:])

tf.Tensor(
[[-2.5211912e-04 -2.3443325e-04 -3.1350570e-04 -3.4411135e-04
  -2.0756546e-04 -3.9110344e-04 -3.0514458e-04 -3.5695612e-04
  -3.5029522e-04 -2.7277105e-04 -3.2770471e-04 -3.4966198e-04
  -2.7396163e-04 -3.2730683e-04 -3.9289700e-04 -3.7934099e-04
  -3.6476974e-04 -2.8553198e-04 -3.2274041e-04 -3.1096558e-04]
 [ 2.3444928e-04  2.6710203e-04  2.2945076e-04  2.1776307e-04
   4.7089939e-05  8.4369618e-05  2.2498221e-04  2.1452940e-04
   9.2276547e-05  2.1715229e-04  2.2350322e-04  1.9113964e-04
   1.9841455e-04  2.2690831e-04  1.7907599e-04  1.7606636e-04
   2.1125347e-04  2.6097416e-04  2.3963756e-04  1.8088223e-04]], shape=(2, 20), dtype=float32)
tf.Tensor(
[[0.27780408]
 [0.2791403 ]
 [0.2786059 ]
 [0.27809304]
 [0.27918383]
 [0.2785737 ]
 [0.27868718]
 [0.2787072 ]
 [0.2787192 ]
 [0.2788821 ]
 [0.27839917]
 [0.27839768]
 [0.27792192]
 [0.27834934]
 [0.27798387]
 [0.2785477 ]
 [0.27759594]
 [0.27900136]
 [0.27868026]
 [0.27871016]], shape=(20, 1), dtype=float32)


-> We need to choose appropriate dtypes so that no operation overflows.

In [123]:
def _fvals1(N, x):

    with tf.GradientTape() as g:
        g.watch(x)
        y = N(x)

    dy_dx = g.gradient(y, x)
    dy_dx = np.transpose(dy_dx.numpy())

    return y, dy_dx

In [124]:
for i in range(10):
    x = tf.random.normal((1,2))
    NeuralN = PINN()
    out = NeuralN(x)
    out1, δ1 = get_gradient(NeuralN, x)
    out2, δ2,ϑ = get_hessian(NeuralN, x)
    δ_ad = _fvals1(NeuralN, x)[1]
    print(np.linalg.norm(δ1-δ2))
    print(np.linalg.norm(δ1-δ_ad))
    print(np.linalg.norm(δ2-δ_ad))

0.0
2.6031258e-10
2.6031258e-10
0.0
2.3999633e-10
2.3999633e-10
0.0
1.5672892e-10
1.5672892e-10
0.0
4.4520299e-10
4.4520299e-10
0.0
3.2927225e-10
3.2927225e-10
0.0
3.4924597e-10
3.4924597e-10
0.0
0.0
0.0
0.0
1.4551915e-10
1.4551915e-10
0.0
2.098707e-10
2.098707e-10
0.0
4.115903e-11
4.115903e-11


In [52]:
def _fvals2(N, x):

    with tf.GradientTape(persistent=True) as h:
        h.watch(x)
        with tf.GradientTape() as g:
            g.watch(x)
            y = N(x)

        dy_dx = g.gradient(y, x)
    
    d2y_d2x = h.jacobian(dy_dx, x)

    return y, dy_dx, d2y_d2x

In [53]:
for i in range(10):
    x = tf.random.normal((1,2))
    NeuralN = PINN()
    out,δ,ϑ = get_hessian(NeuralN,x)
    ϑ_ad = _fvals2(NeuralN, x)[2]
    print(np.linalg.norm(ϑ-ϑ_ad))

7.821082e-05
0.0008649627
0.00021177101
0.0008003311
0.0002645101
0.00016619818
5.018935e-05
0.00019621728
0.00013533684
0.00026718286


Maybe gradient tape thinks that the neural network is not differentiable?

# Explicit derivatives of ResNet

Here we only approximate the "half" gradient so far. 

In [164]:
class PINN_ResNet(tf.keras.Model):
    """ Set basic architecture of the PINN model."""

    def __init__(self,
                 ResNetLayers=2,
                 ResNetNeurons=16,
                 ResNetStepsize=1.0,
                 ResNetActivation='sigmoid',
                 **kwargs):
        
        super(PINN_ResNet, self).__init__(**kwargs)
        
        #RNact = tf.keras.activations.get(ResNetActivation)
        #RNact = my_act
        RNact = ResNetActivation
        

        
        self.ResNetLayers = ResNetLayers
        self.ResNetStepsize = ResNetStepsize

        self.ResNet = [tf.keras.layers.Dense(ResNetNeurons,
                                        activation = RNact) for _ in range(self.ResNetLayers)]
        self.wb = tf.keras.layers.Dense(1)
        self.A = tf.keras.layers.Dense(2, use_bias=False)
        self.c = tf.keras.layers.Dense(1, use_bias=False)
        
        #self.num_hidden_layers = num_hidden_layers
        self.input_dim = 2
        self.output_dim = 1


        # Define NN architecture
        
        # Output layer
        #self.out = tf.keras.layers.Dense(1, activation='sigmoid')

        
    def call(self, input_tensor, training=False):
        """Forward-pass through neural network."""
        
        self.tmp_layer_output = [input_tensor]
        
        N = self.ResNet[0](input_tensor, training=training)
        
        for i in range(1, self.ResNetLayers):
            self.tmp_layer_output.append(N)
            N = N + self.ResNetStepsize * self.ResNet[i](N, training=training)
        
        Phi = self.wb(N, training=training)

        As = self.A(input_tensor, training=training)
        sAs = tf.keras.layers.Dot(axes=(1))([input_tensor, As])
        Phi += .5 * sAs
        Phi += self.c(input_tensor, training=training)
            
        return Phi

Gradient of model, which approximates solution of pde

In [165]:
def get_gradient_ResNet(R,x):
    output = R(x)
    δ = get_gradient_layer(R.ResNet[-1].weights[0], R.ResNet[-1].weights[1], R.tmp_layer_output[-1], R.wb.weights[0])

    δ = R.wb.weights[0] + R.ResNetStepsize * δ
 
    for k in range(R.ResNetLayers-2, 0, -1):
        δ = δ + R.ResNetStepsize * get_gradient_layer(R.ResNet[k].weights[0], R.ResNet[k].weights[1], R.tmp_layer_output[k], δ)
          
    
    δ = get_gradient_layer(R.ResNet[0].weights[0], R.ResNet[0].weights[1], R.tmp_layer_output[0], δ)
    
    M = R.A.weights[0]
    
    return output, δ + 0.5*tf.transpose(x @ (M + tf.transpose(M))) + R.c.weights[0]

Something is wrong with the 'whole' gradient?

In [166]:
Resnet = PINN_ResNet()

x = tf.constant([[1., 10.]])

out, δ = get_gradient_ResNet(Resnet,x)

print(δ)

δ_ad = _fvals1(Resnet, x)

print(δ_ad[1])

print(tf.linalg.norm(δ - δ_ad[1]))

tf.Tensor(
[[ 9.111064  ]
 [-0.36449236]], shape=(2, 1), dtype=float32)
[[ 9.111063  ]
 [-0.36449233]]
tf.Tensor(9.541399e-07, shape=(), dtype=float32)


In [167]:
for i in range(10):
    x = tf.random.normal((1,2))
    Resnet = PINN_ResNet()
    out, δ1 = get_gradient_ResNet(Resnet, x)
    δ_ad = _fvals1(Resnet, x)[1]
    print(np.linalg.norm(δ1-δ_ad))

0.0
1.1920929e-07
1.1920929e-07
1.1920929e-07
1.1920929e-07
0.0
1.1920929e-07
8.4293696e-08
7.598131e-08
0.0


In [186]:
x = tf.random.normal((10,2))
Resnet = PINN_ResNet()
out, δ1 = get_gradient_ResNet(Resnet, x)
δ_ad = _fvals1(Resnet, x)[1]
print(δ1)
print(δ_ad)
print(np.linalg.norm(δ1-δ_ad))

tf.Tensor(
[[-1.0859847  -1.2132406  -1.1828234  -1.2834083  -1.1984874  -1.5392333
  -1.3509641  -1.1210082  -1.108957   -0.7772795 ]
 [-0.29275456 -0.21046205 -0.24362211 -0.1349855  -0.27058196  0.14021671
  -0.09764048 -0.37375104 -0.38146296 -0.77618027]], shape=(2, 10), dtype=float32)
[[-1.0859847  -1.2132405  -1.1828234  -1.2834083  -1.1984873  -1.5392333
  -1.3509642  -1.1210083  -1.1089572  -0.7772795 ]
 [-0.29275456 -0.21046206 -0.2436221  -0.13498548 -0.27058196  0.14021675
  -0.09764045 -0.37375104 -0.381463   -0.7761802 ]]
2.832201e-07


In [168]:
def get_gradient_hessian_layer_ResNet(W,b,a,δ):
#     z1 = np.transpose(a @ W)  
#     b = np.reshape(b, np.shape(z1))
#     z2 = z1 + b
#     z3 = np.diag(tf.reshape(mdσ(z2), [-1])) @ δ
    
#     z4 = md2σ(z2) * δ
#     ϑ = np.diag(tf.reshape(z4, [-1]))
    
#     return W @ z3, W @ ϑ @ np.transpose(W)

    z = tf.transpose(a @ W + b)
    return W @ (mdσ(z) * δ), W @ ((md2σ(z) * δ) * tf.transpose(W)), z

In [169]:
def get_hessian_ResNet(R,x):
    output = R(x)
    δ,ϑ,z = get_gradient_hessian_layer_ResNet(R.ResNet[-1].weights[0], R.ResNet[-1].weights[1], R.tmp_layer_output[-1], R.wb.weights[0])

    δ = R.wb.weights[0] + R.ResNetStepsize * δ
 
    for k in range(R.ResNetLayers-2, 0, -1):
        δ_new, ϑ_new_1, z = get_gradient_hessian_layer_ResNet(R.ResNet[k].weights[0], R.ResNet[k].weights[1], R.tmp_layer_output[k], δ)
        t = ϑ + R.ResNetStepsize * R.ResNet[k].weights[0] @ ( mdσ(z) * ϑ)
        ϑ_new_2 = tf.transpose(t) + R.ResNetStepsize * R.ResNet[k].weights[0] @ ( mdσ(z) * tf.transpose(t))
        ϑ = ϑ_new_1 + ϑ_new_2
        δ = δ + R.ResNetStepsize * δ_new
    
      
    δ, ϑ = get_gradient_hessian_hidden_layer(R.ResNet[0].weights[0], R.ResNet[0].weights[1], R.tmp_layer_output[0], δ, ϑ)
    
    M = R.A.weights[0]
    
    return output, δ + 0.5*tf.transpose(x @ (M + tf.transpose(M))) + R.c.weights[0], ϑ + 0.5*(M + tf.transpose(M))

In [180]:
tf.random.set_seed(0)
for i in range(10):
    x = tf.random.normal((1,2))
    Resnet = PINN_ResNet()
    #out = Resnet(x)
    out, δ,ϑ = get_hessian_ResNet(Resnet,x)
    ϑ_ad = _fvals2(Resnet, x)[2]
    ϑ = ϑ.numpy().flatten('F')
    ϑ_ad = ϑ_ad.numpy().flatten('F')
    print(np.linalg.norm(ϑ-ϑ_ad))

5.9604645e-08
5.9604645e-08
5.9604645e-08
0.0
1.1920929e-07
0.0
5.9604645e-08
1.4901161e-08
1.4901161e-08
1.1920929e-07
