# First and second derivative of FNN with respect to input

Import necessary packages.

In [23]:
import tensorflow as tf
import numpy as np

Define activation function and its derivatives.

In [24]:
# Custom activation function
# from keras.layers import Activation
# from keras import backend as K
# from keras.utils.generic_utils import get_custom_objects

#def mσ(x):
    #return np.abs(x) + np.log(1. + np.exp(-2. * np.abs(x)))
    
def mσ(x):
    return np.divide(1, 1 + np.exp(np.negative(x)))

# get_custom_objects().update({'custom_activation': Activation(mσ)})


In [25]:
#def mdσ(x):
    #return np.tanh(x)
    
    
#def md2σ(x):
    #return np.divide(1., np.square(np.cosh(x)))

def mdσ(x):
    return mσ(x) * (1 - mσ(x))
    
    
def md2σ(x):
    return mσ(x) * (1 - mσ(x)) * (1 - 2*mσ(x))

In [26]:
x = [[10.], [20.], [30.]]

print(mσ(x))
print(tf.keras.activations.sigmoid(x))
print(mdσ(x))
print(md2σ(x))

[[0.9999546]
 [1.       ]
 [1.       ]]
tf.Tensor(
[[0.9999546]
 [1.       ]
 [1.       ]], shape=(3, 1), dtype=float32)
[[4.53958077e-05]
 [2.06115369e-09]
 [9.34807787e-14]]
[[-4.53916860e-05]
 [-2.06115368e-09]
 [-9.34807787e-14]]


Does not exactly match the results/values in Julia.

Define Neural Network.

In [27]:
# Define model architecture
class PINN(tf.keras.Model):
    """ Set basic architecture of the PINN model."""

    def __init__(self,
                 output_dim=1,
                 num_hidden_layers=3,
                 num_neurons_per_layer=20,
                 activationfunction = 'sigmoid',
                 kernel_initializer='glorot_normal',
                 **kwargs):
        
        super().__init__(**kwargs)

        self.num_hidden_layers = num_hidden_layers
        self.input_dim = 2
        self.output_dim = output_dim

        # Define NN architecture
        
        # Inititialize num_hidden_layers many fully connected dense layers
        self.hidden = [tf.keras.layers.Dense(num_neurons_per_layer,
                                             activation = activationfunction,
                                             kernel_initializer=kernel_initializer) for _ in range(self.num_hidden_layers)]
        
        # Output layer
        #self.out = tf.keras.layers.Dense(output_dim, activation=None)
        self.out = tf.keras.layers.Dense(output_dim, activation = activationfunction)
        
    def call(self, X):
        """Forward-pass through neural network."""
        self.tmp_layer_output = [X]
        #Z = self.scale(X)
        Z = X
        
        for i in range(self.num_hidden_layers):
            Z = self.hidden[i](Z)
            self.tmp_layer_output.append(Z)
            
        return self.out(Z)

Compute gradient.

Compute gradient for layer l.

In [28]:
def get_gradient_layer(W,b,a,δ):
#     z1 = np.transpose(a @ W)  
#     b = np.reshape(b, np.shape(z1))
#     z2 = z1 + b
#     z3 = np.diag(mdσ(z2).flatten('F')) @ δ
    
#     return W @ z3
    return W @ (mdσ(np.transpose(a @ W + b)) * δ)

Compute gradient of neural network.

In [29]:
def get_gradient(N):
    δ = get_gradient_layer(N.out.get_weights()[0], N.out.get_weights()[1], N.tmp_layer_output[-1], np.identity(N.output_dim))

    for k in range(N.num_hidden_layers-1, -1, -1):
        δ = get_gradient_layer(N.hidden[k].get_weights()[0], N.hidden[k].get_weights()[1], N.tmp_layer_output[k], δ)
            
    return δ

Compute gradient and Hessian of last layer.

In [30]:
def get_gradient_hessian_last_layer(W,b,a,δ):
#     z1 = np.transpose(a @ W)  
#     b = np.reshape(b, np.shape(z1))
#     z2 = z1 + b
#     z3 = np.diag(mdσ(z2).flatten('F')) @ δ
    
#     ϑ = np.diag(md2σ(z2).flatten('F'))
    
#     return W @ z3, W @ ϑ @ np.transpose(W)
    z = np.transpose(a @ W + b)
    return W @ (mdσ(z) * δ), W @ (md2σ(z) * np.transpose(W))

Compute gradient and Hessian of hidden layer.

In [31]:
def get_gradient_hessian_hidden_layer(W,b,a,δ,ϑ):
#     z1 = np.transpose(a @ W)  
#     b = np.reshape(b, np.shape(z1))
#     z2 = z1 + b
#     z3 = np.diag(mdσ(z2).flatten('F')) @ δ
    
#     t2 = δ * md2σ(z2)
#     H1 = W @ np.diag(t2.flatten('F')) @ np.transpose(W)

#     dσt = np.diag(mdσ(z2).flatten('F'))
#     H2 = W @ dσt @ ϑ @ dσt @ np.transpose(W)
    
#     return W @ z3, H1+H2
    z = np.transpose(a @ W + b)
    dσt = mdσ(z) * np.transpose(W)
    return W @ (mdσ(z) * δ), W @ ((md2σ(z) * δ) * np.transpose(W)) + np.transpose(dσt) @ ϑ @ dσt 

Compute Hessian and gradient of neural network.

In [32]:
def get_hessian(N):
    δ,ϑ = get_gradient_hessian_last_layer(N.out.get_weights()[0], N.out.get_weights()[1], N.tmp_layer_output[-1], np.identity(N.output_dim))

    for k in range(N.num_hidden_layers-1, -1, -1):
        δ,ϑ = get_gradient_hessian_hidden_layer(N.hidden[k].get_weights()[0], N.hidden[k].get_weights()[1], N.tmp_layer_output[k], δ,  ϑ)
            
    return δ,ϑ

In [33]:
z = np.transpose(NeuralN.tmp_layer_output[-2] @ NeuralN.hidden[-1].weights[0] + NeuralN.hidden[-1].weights[1])
print(NeuralN.tmp_layer_output[-2].shape)
print(NeuralN.hidden[-1].weights[0].shape)
print(NeuralN.hidden[-1].weights[1].shape)
print(z.shape)
print(md2σ(z))

(1, 20)
(20, 20)
(20,)
(20, 1)
[[-0.00173629]
 [-0.0461097 ]
 [ 0.02222086]
 [-0.00479989]
 [-0.00386268]
 [-0.01797646]
 [ 0.06091084]
 [-0.07671752]
 [-0.00100137]
 [ 0.07066401]
 [-0.03450035]
 [-0.04417004]
 [-0.08326985]
 [ 0.02460143]
 [ 0.03797385]
 [-0.08481718]
 [ 0.05611779]
 [-0.06934197]
 [ 0.05944929]
 [-0.05830243]]


Why do we get a 2D vector when we insert a 2D vector?

In [34]:
NeuralN = PINN()

x = tf.random.normal((1,2))

out = NeuralN(x)
#print(out)

δ1 = get_gradient(NeuralN)
δ2,ϑ = get_hessian(NeuralN)

print(δ1- δ2)
#print(δ1[0])
#print(ϑ.flatten())

[[0.]
 [0.]]


-> We need to choose appropriate dtypes so that no operation overflows.

In [35]:
def _fvals1(N, x):

    with tf.GradientTape() as g:
        g.watch(x)
        y = N(x)

    dy_dx = g.gradient(y, x)
    dy_dx = np.transpose(dy_dx.numpy())

    return y, dy_dx

In [36]:
for i in range(5):
    x = tf.random.normal((1,2))
    NeuralN = PINN()
    out = NeuralN(x)
    δ1 = get_gradient(NeuralN)
    δ2,ϑ = get_hessian(NeuralN)
    δ_ad = _fvals1(NeuralN, x)[1]
    print(np.linalg.norm(δ1-δ2))
    print(np.linalg.norm(δ1-δ_ad))
    print(np.linalg.norm(δ2-δ_ad))

0.0
4.5843378546270136e-10
4.5843378546270136e-10
0.0
8.251608966725052e-11
8.251608966725052e-11
0.0
1.6434315828037948e-10
1.6434315828037948e-10
0.0
1.751187946328149e-10
1.751187946328149e-10
0.0
1.7745272322123738e-10
1.7745272322123738e-10


In [37]:
def _fvals2(N, x):

    with tf.GradientTape(persistent=True) as h:
        h.watch(x)
        with tf.GradientTape() as g:
            g.watch(x)
            y = N(x)

        dy_dx = g.gradient(y, x)
    
    d2y_d2x = h.jacobian(dy_dx, x)

    return y, dy_dx, d2y_d2x

In [38]:
for i in range(5):
    x = tf.random.normal((1,2))
    NeuralN = PINN()
    out = NeuralN(x)
    δ,ϑ = get_hessian(NeuralN)
    ϑ = ϑ.flatten()
    ϑ_ad = _fvals2(NeuralN, x)[2].numpy().flatten()
    print(np.linalg.norm(ϑ-ϑ_ad))

5.306208975426972e-11
1.242952224669795e-11
2.4844468069760115e-11
1.0945939278330516e-10
7.201135406061884e-11


Maybe gradient tape thinks that the neural network is not differentiable?

# Explicit derivatives of ResNet

Here we only approximate the "half" gradient so far. 

In [198]:
class PINN_ResNet(tf.keras.Model):
    """ Set basic architecture of the PINN model."""

    def __init__(self,
                 ResNetLayers=3,
                 ResNetNeurons=16,
                 ResNetStepsize=1.0,
                 ResNetActivation='sigmoid',
                 **kwargs):
        
        super(PINN_ResNet, self).__init__(**kwargs)
        
        #RNact = tf.keras.activations.get(ResNetActivation)
        #RNact = my_act
        RNact = ResNetActivation
        

        
        self.ResNetLayers = ResNetLayers
        self.ResNetStepsize = ResNetStepsize

        self.ResNet = [tf.keras.layers.Dense(ResNetNeurons,
                                        activation = RNact) for _ in range(self.ResNetLayers)]
        self.wb = tf.keras.layers.Dense(1)
        self.A = tf.keras.layers.Dense(2, use_bias=False)
        self.c = tf.keras.layers.Dense(1, use_bias=False)
        
        #self.num_hidden_layers = num_hidden_layers
        self.input_dim = 2
        self.output_dim = 1


        # Define NN architecture
        
        # Output layer
        #self.out = tf.keras.layers.Dense(1, activation='sigmoid')

        
    def call(self, input_tensor, training=False):
        """Forward-pass through neural network."""
        
        self.tmp_layer_output = [input_tensor]
        
        N = self.ResNet[0](input_tensor, training=training)
        
        for i in range(1, self.ResNetLayers):
            self.tmp_layer_output.append(N)
            N = N + self.ResNetStepsize * self.ResNet[i](N, training=training)
        
        Phi = self.wb(N, training=training)

        As = self.A(input_tensor, training=training)
        sAs = tf.keras.layers.Dot(axes=(1))([input_tensor, As])
        Phi += .5 * sAs
        Phi += self.c(input_tensor, training=training)
            
        return Phi

Gradient of model, which approximates solution of pde

In [199]:
def get_gradient_ResNet(R, x):
    output = R(x)
    δ = get_gradient_layer(R.ResNet[-1].get_weights()[0], R.ResNet[-1].get_weights()[1], R.tmp_layer_output[-1], R.wb.get_weights()[0])

    δ = R.wb.get_weights()[0] + R.ResNetStepsize * δ
 
    for k in range(R.ResNetLayers-2, 0, -1):
        δ = δ + R.ResNetStepsize * get_gradient_layer(R.ResNet[k].get_weights()[0], R.ResNet[k].get_weights()[1], R.tmp_layer_output[k], δ)
          
    
    δ = get_gradient_layer(R.ResNet[0].get_weights()[0], R.ResNet[0].get_weights()[1], R.tmp_layer_output[0], δ)
    
    M = R.A.get_weights()[0]
    
    return output, δ + 0.5*np.transpose(x @ (M + np.transpose(M))) + R.c.get_weights()[0]
    #return δ 

Something is wrong with the 'whole' gradient?

In [200]:
Resnet = PINN_ResNet()

x = tf.constant([[1., 10.]])


out, δ = get_gradient_ResNet(Resnet,x)

print(δ)

δ_ad = _fvals1(Resnet, x)

print(δ_ad[1])

print(np.linalg.norm(δ - δ_ad[1]))

[[4.1017737]
 [3.9596167]]
[[4.1017733]
 [3.959617 ]]
5.3312016e-07


In [211]:
for i in range(10):
    x = tf.random.normal((1,2))
    Resnet = PINN_ResNet()
    out, δ1 = get_gradient_ResNet(Resnet,x)
    δ_ad = _fvals1(Resnet, x)[1]
    print(np.linalg.norm(δ1-δ_ad))

1.2287812e-07
1.1920929e-07
1.3328004e-07
1.1920929e-07
6.143906e-08
1.2287812e-07
0.0
2.9802322e-08
8.940697e-08
1.4901161e-08


In [205]:
def get_gradient_hessian_layer_ResNet(W,b,a,δ):
#     z1 = np.transpose(a @ W)  
#     b = np.reshape(b, np.shape(z1))
#     z2 = z1 + b
#     z3 = np.diag(mdσ(z2).flatten('F')) @ δ
    
#     z4 = md2σ(z2) * δ
#     ϑ = np.diag(z4.flatten('F'))
    
#     return W @ z3, W @ ϑ @ np.transpose(W)

    z = np.transpose(a @ W + b)
    return W @ (mdσ(z) * δ), W @ ((md2σ(z) * δ) * np.transpose(W)), z

In [215]:
def get_hessian_ResNet(R,x):
    output = R(x)
    δ,ϑ,z = get_gradient_hessian_layer_ResNet(R.ResNet[-1].get_weights()[0], R.ResNet[-1].get_weights()[1], R.tmp_layer_output[-1], R.wb.get_weights()[0])

    δ = R.wb.get_weights()[0] + R.ResNetStepsize * δ
 
    for k in range(R.ResNetLayers-2, 0, -1):
        δ_new, ϑ_new_1, z = get_gradient_hessian_layer_ResNet(R.ResNet[k].get_weights()[0], R.ResNet[k].get_weights()[1], R.tmp_layer_output[k], δ)
        t = ϑ + R.ResNetStepsize * R.ResNet[k].get_weights()[0] @ ( mdσ(z) * ϑ)
        ϑ_new_2 = np.transpose(t) + R.ResNetStepsize * R.ResNet[k].get_weights()[0] @ ( mdσ(z) * np.transpose(t))
        ϑ = ϑ_new_1 + ϑ_new_2
        δ = δ + R.ResNetStepsize * δ_new
    
      
    δ, ϑ = get_gradient_hessian_hidden_layer(R.ResNet[0].get_weights()[0], R.ResNet[0].get_weights()[1], R.tmp_layer_output[0], δ, ϑ)
    
    M = R.A.get_weights()[0]
    
    return output, δ + 0.5*np.transpose(x @ (M + np.transpose(M))) + R.c.get_weights()[0], ϑ + 0.5*(M + np.transpose(M))

In [217]:
for i in range(20):
    x = tf.random.normal((1,2), dtype=tf.float64)
    Resnet = PINN_ResNet()
    out,δ,ϑ = get_hessian_ResNet(Resnet,x)
    ϑ = ϑ.flatten()
    ϑ_ad = _fvals2(Resnet, x)[2].numpy().flatten()
    print(np.linalg.norm(ϑ-ϑ_ad))

8.429369702178807e-08
4.2146848510894035e-08
0.0
3.725290298461914e-09
0.0
1.1920928955078125e-07
7.450580596923828e-09
6.143906154658885e-08
0.0
1.4901161193847656e-08
1.3170890159654385e-08
0.0
5.960464477539063e-08
8.429369702178807e-08
1.0536712127723509e-08
0.0
0.0
0.0
1.580506819158526e-08
1.1920928955078125e-07
