# First and second derivative of FNN with respect to input (tensorflow)

Import necessary packages.

In [19]:
import tensorflow as tf
import numpy as np

Define activation function and its derivatives.

In [20]:
# Custom activation function
from keras.layers import Activation
from keras import backend as K
from keras.utils.generic_utils import get_custom_objects

#def mσ(x):
    #return np.abs(x) + np.log(1. + np.exp(-2. * np.abs(x)))
    
def mσ(x):
    return tf.math.divide(1, 1 + tf.math.exp(tf.math.negative(x)))

get_custom_objects().update({'custom_activation': Activation(mσ)})


In [21]:
#def mdσ(x):
    #return np.tanh(x)
    
    
#def md2σ(x):
    #return np.divide(1., np.square(np.cosh(x)))

def mdσ(x):
    return mσ(x) * (1 - mσ(x))
    
    
def md2σ(x):
    return mσ(x) * (1 - mσ(x)) * (1 - 2*mσ(x))

In [22]:
x = [[10.], [20.], [30.]]

print(mσ(x))
print(tf.keras.activations.sigmoid(x))
print(mdσ(x))
print(md2σ(x))

tf.Tensor(
[[0.9999546]
 [1.       ]
 [1.       ]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[0.9999546]
 [1.       ]
 [1.       ]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[4.5416677e-05]
 [0.0000000e+00]
 [0.0000000e+00]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[-4.541255e-05]
 [-0.000000e+00]
 [-0.000000e+00]], shape=(3, 1), dtype=float32)


Does not exactly match the results/values in Julia.

Define Neural Network.

In [71]:
# Define model architecture
class PINN(tf.keras.Model):
    """ Set basic architecture of the PINN model."""

    def __init__(self,
                 output_dim=1,
                 num_hidden_layers=3,
                 num_neurons_per_layer=20,
                 activationfunction = 'sigmoid',
                 kernel_initializer='glorot_normal',
                 **kwargs):
        
        super().__init__(**kwargs)

        self.num_hidden_layers = num_hidden_layers
        self.input_dim = 2
        self.output_dim = output_dim

        # Define NN architecture
        
        # Inititialize num_hidden_layers many fully connected dense layers
        self.hidden = [tf.keras.layers.Dense(num_neurons_per_layer,
                                             activation = activationfunction,
                                             kernel_initializer=kernel_initializer) for _ in range(self.num_hidden_layers)]
        
        # Output layer
        #self.out = tf.keras.layers.Dense(output_dim, activation=None)
        self.out = tf.keras.layers.Dense(output_dim, activation = activationfunction)
        
    def call(self, X):
        """Forward-pass through neural network."""
        self.tmp_layer_output = []
        #Z = self.scale(X)
        Z = X
        self.tmp_layer_output.append(Z)
        
        for i in range(self.num_hidden_layers):
            Z = self.hidden[i](Z)
            self.tmp_layer_output.append(Z)
            
        return self.out(Z)

Compute gradient.

Compute gradient for layer l.

In [72]:
def get_gradient_layer(W,b,a,δ):
    z1 = tf.transpose(a @ W)  
    b = tf.reshape(b, z1.shape)
    z2 = z1 + b
    z3 = mdσ(z2) * δ
    
    return W @ z3

Compute gradient of neural network.

In [73]:
def get_gradient(N):
    δ = get_gradient_layer(N.out.weights[0], N.out.weights[1], N.tmp_layer_output[-1], np.identity(N.output_dim))

    for k in range(N.num_hidden_layers-1, -1, -1):
        δ = get_gradient_layer(N.hidden[k].weights[0], N.hidden[k].weights[1], N.tmp_layer_output[k], δ)
            
    return δ

Compute gradient and Hessian of last layer.

In [74]:
def get_gradient_hessian_last_layer(W,b,a,δ):
    z1 = tf.transpose(a @ W)  
    b = tf.reshape(b, z1.shape)
    z2 = z1 + b
    z3 = mdσ(z2) * δ
    
    ϑ = tf.linalg.diag(tf.reshape(md2σ(z2), [-1]))
    
    return W @ z3, W @ ϑ @ tf.transpose(W)

Compute gradient and Hessian of hidden layer.

In [75]:
def get_gradient_hessian_hidden_layer(W,b,a,δ,ϑ):
    z1 = tf.transpose(a @ W)  
    b = np.reshape(b, np.shape(z1))
    z2 = z1 + b
    z3 = mdσ(z2) * δ
    
    t2 = δ * md2σ(z2)
    H1 = W @ tf.linalg.diag(tf.reshape(t2, [-1])) @ tf.transpose(W)

    dσt = tf.linalg.diag(tf.reshape(mdσ(z2), [-1]))
    H2 = W @ dσt @ ϑ @ dσt @ tf.transpose(W)
    
    return W @ z3, H1+H2

Compute Hessian and gradient of neural network.

In [76]:
def get_hessian(N):
    δ,ϑ = get_gradient_hessian_last_layer(N.out.weights[0], N.out.weights[1], N.tmp_layer_output[-1], np.identity(N.output_dim))

    for k in range(N.num_hidden_layers-1, -1, -1):
        δ,ϑ = get_gradient_hessian_hidden_layer(N.hidden[k].weights[0], N.hidden[k].weights[1], N.tmp_layer_output[k], δ,  ϑ)
            
    return δ,ϑ

Why do we get a 2D vector when we insert a 2D vector?

In [77]:
NeuralN = PINN()

x = tf.random.normal((1,2))

out = NeuralN(x)
#print(out)

δ1 = get_gradient(NeuralN)
δ2,ϑ = get_hessian(NeuralN)

print(δ1- δ2)
print(δ1)
print(tf.reshape(ϑ, [-1]))



tf.Tensor(
[[0.]
 [0.]], shape=(2, 1), dtype=float32)
tf.Tensor(
[[0.00219784]
 [0.00040738]], shape=(2, 1), dtype=float32)
tf.Tensor([2.1990007e-04 2.0248743e-05 2.0248741e-05 2.2907179e-05], shape=(4,), dtype=float32)


In [78]:
NeuralN.hidden[0].weights[0]

<tf.Variable 'pinn_77/dense_308/kernel:0' shape=(2, 20) dtype=float32, numpy=
array([[-0.050066  , -0.14161286,  0.1861498 , -0.14060965, -0.28997618,
        -0.06024836, -0.01600968,  0.1230906 , -0.16865018,  0.16376644,
         0.12977847,  0.53065395,  0.52102387,  0.4689893 ,  0.3328992 ,
        -0.05310843, -0.64260006, -0.14978583,  0.2796152 ,  0.08261903],
       [ 0.43730757,  0.05173635,  0.04774111, -0.13227905, -0.16938747,
        -0.3721871 ,  0.08980095, -0.09904161,  0.39719364, -0.05187916,
        -0.41392252, -0.17660563, -0.5699045 , -0.37745818,  0.4979724 ,
        -0.1005419 , -0.23676614, -0.3856979 , -0.11393765,  0.50251454]],
      dtype=float32)>

-> We need to choose appropriate dtypes so that no operation overflows.

In [67]:
def _fvals1(N, x):

    with tf.GradientTape() as g:
        g.watch(x)
        y = N(x)

    dy_dx = g.gradient(y, x)
    dy_dx = np.transpose(dy_dx.numpy())

    return y, dy_dx

In [68]:
for i in range(10):
    x = tf.random.normal((1,2))
    NeuralN = PINN()
    out = NeuralN(x)
    δ1 = get_gradient(NeuralN)
    δ2,ϑ = get_hessian(NeuralN)
    δ_ad = _fvals1(NeuralN, x)[1]
    print(np.linalg.norm(δ1-δ2))
    print(np.linalg.norm(δ1-δ_ad))
    print(np.linalg.norm(δ2-δ_ad))

0.0
2.6031258e-10
2.6031258e-10
0.0
3.6813758e-10
3.6813758e-10
0.0
1.9792745e-10
1.9792745e-10
0.0
1.3015629e-10
1.3015629e-10
0.0
1.1641532e-10
1.1641532e-10
0.0
2.3283064e-10
2.3283064e-10
0.0
2.3283064e-10
2.3283064e-10
0.0
2.7456498e-10
2.7456498e-10
0.0
2.3999633e-10
2.3999633e-10
0.0
3.727109e-10
3.727109e-10


In [69]:
def _fvals2(N, x):

    with tf.GradientTape(persistent=True) as h:
        h.watch(x)
        with tf.GradientTape() as g:
            g.watch(x)
            y = N(x)

        dy_dx = g.gradient(y, x)
    
    d2y_d2x = h.jacobian(dy_dx, x)

    return y, dy_dx, d2y_d2x

In [70]:
for i in range(10):
    x = tf.random.normal((1,2))
    NeuralN = PINN()
    out = NeuralN(x)
    δ,ϑ = get_hessian(NeuralN)
    ϑ_ad = _fvals2(NeuralN, x)[2]
    print(np.linalg.norm(ϑ-ϑ_ad))

0.00046507386
8.289944e-05
0.00030583996
0.00010691532
0.00010735289
0.00035789586
0.00035949846
0.00032030875
0.00036833013
0.00024477145


In [18]:
x = tf.random.normal((10,))
t = tf.random.normal((10,))
tx = tf.stack([t, x], axis=1)
print(NeuralN(tx))



tf.Tensor(
[[0.14998177]
 [0.15036088]
 [0.15024763]
 [0.15099967]
 [0.1509214 ]
 [0.14976248]
 [0.14969146]
 [0.15064535]
 [0.14984101]
 [0.15021867]], shape=(10, 1), dtype=float32)


Maybe gradient tape thinks that the neural network is not differentiable?

# Explicit derivatives of ResNet

Here we only approximate the "half" gradient so far. 

In [23]:
class PINN_ResNet(tf.keras.Model):
    """ Set basic architecture of the PINN model."""

    def __init__(self,
                 ResNetLayers=2,
                 ResNetNeurons=16,
                 ResNetStepsize=1.0,
                 ResNetActivation='sigmoid',
                 **kwargs):
        
        super(PINN_ResNet, self).__init__(**kwargs)
        
        #RNact = tf.keras.activations.get(ResNetActivation)
        #RNact = my_act
        RNact = ResNetActivation
        

        
        self.ResNetLayers = ResNetLayers
        self.ResNetStepsize = ResNetStepsize

        self.ResNet = [tf.keras.layers.Dense(ResNetNeurons,
                                        activation = RNact) for _ in range(self.ResNetLayers)]
        self.wb = tf.keras.layers.Dense(1)
        self.A = tf.keras.layers.Dense(2, use_bias=False)
        self.c = tf.keras.layers.Dense(1, use_bias=False)
        
        #self.num_hidden_layers = num_hidden_layers
        self.input_dim = 2
        self.output_dim = 1


        # Define NN architecture
        
        # Output layer
        #self.out = tf.keras.layers.Dense(1, activation='sigmoid')

        
    def call(self, input_tensor, training=False):
        """Forward-pass through neural network."""
        
        self.tmp_layer_output = [input_tensor]
        
        N = self.ResNet[0](input_tensor, training=training)
        
        for i in range(1, self.ResNetLayers):
            self.tmp_layer_output.append(N)
            N = N + self.ResNetStepsize * self.ResNet[i](N, training=training)
        
        Phi = self.wb(N, training=training)

#         As = self.A(input_tensor, training=training)
#         sAs = tf.keras.layers.Dot(axes=(1))([input_tensor, As])
#         Phi += .5 * sAs
#         Phi += self.c(input_tensor, training=training)
            
        return Phi

Gradient of model, which approximates solution of pde

In [24]:
def get_gradient_ResNet(R):
    δ = get_gradient_layer(R.ResNet[-1].weights[0], R.ResNet[-1].weights[1], R.tmp_layer_output[-1], R.wb.weights[0])

    δ = R.wb.weights[0] + R.ResNetStepsize * δ
 
    for k in range(R.ResNetLayers-2, 0, -1):
        δ = δ + R.ResNetStepsize * get_gradient_layer(R.ResNet[k].weights[0], R.ResNet[k].weights[1], R.tmp_layer_output[k], δ)
          
    
    δ = get_gradient_layer(R.ResNet[0].weights[0], R.ResNet[0].weights[1], R.tmp_layer_output[0], δ)
    
    #return δ + np.transpose(R.A(R.tmp_layer_output[0]).numpy()) + R.c.get_weights()[0]
    return δ 

Something is wrong with the 'whole' gradient?

In [25]:
Resnet = PINN_ResNet()

x = tf.constant([[1., 10.]])

out = Resnet(x)

δ = get_gradient_ResNet(Resnet)

print(δ)

δ_ad = _fvals1(Resnet, x)

print(δ_ad[1])

print(np.linalg.norm(δ - δ_ad[1]))

tf.Tensor(
[[ 0.0401888 ]
 [-0.01666789]], shape=(2, 1), dtype=float32)
[[ 0.0401888 ]
 [-0.01666789]]
1.8626451e-09


In [26]:
for i in range(5):
    x = tf.random.normal((1,2))
    Resnet = PINN_ResNet()
    out = Resnet(x)
    δ1 = get_gradient_ResNet(Resnet)
    δ_ad = _fvals1(Resnet, x)[1]
    print(np.linalg.norm(δ1-δ_ad))

0.0
3.1828893e-08
1.4901161e-08
1.3038516e-08
0.0


In [27]:
def get_gradient_hessian_layer_ResNet(W,b,a,δ):
    z1 = np.transpose(a @ W)  
    b = np.reshape(b, np.shape(z1))
    z2 = z1 + b
    z3 = np.diag(tf.reshape(mdσ(z2), [-1])) @ δ
    
    z4 = md2σ(z2) * δ
    ϑ = np.diag(tf.reshape(z4, [-1]))
    
    return W @ z3, W @ ϑ @ np.transpose(W)

In [31]:
def get_hessian_ResNet(R):
    δ,ϑ = get_gradient_hessian_layer_ResNet(R.ResNet[-1].weights[0], R.ResNet[-1].weights[1], R.tmp_layer_output[-1], R.wb.weights[0])

    δ = R.wb.weights[0] + R.ResNetStepsize * δ
 
    for k in range(R.ResNetLayers-2, 0, -1):
        δ_new, ϑ_new_1 = get_gradient_hessian_layer_ResNet(R.ResNet[k].weights[0], R.ResNet[k].weights[1], R.tmp_layer_output[k], δ)
        z1 = np.transpose(R.tmp_layer_output[k] @ R.ResNet[k].weights[0])  
        b = np.reshape(R.ResNet[k].weights()[1], np.shape(z1))
        z2 = z1 + b
        t1 = ϑ + R.ResNetStepsize * R.ResNet[k].weights[0] @ np.diag(tf.reshape(mdσ(z2), [-1])) @ ϑ
        ϑ_new_2 = np.transpose(t1) + R.ResNetStepsize * R.ResNet[k].weights[0] @ np.diag(tf.reshape(mdσ(z2), [-1])) @ np.transpose(t1)
        ϑ = ϑ_new_1 + ϑ_new_2
        δ = δ + R.ResNetStepsize * δ_new
    
      
    δ, ϑ = get_gradient_hessian_hidden_layer(R.ResNet[0].weights[0], R.ResNet[0].weights[1], R.tmp_layer_output[0], δ, ϑ)
    
    #return δ + np.transpose(R.A(R.tmp_layer_output[0]).numpy()) + R.c.get_weights()[0], ϑ + np.transpose(R.A.get_weights()[0])
    return δ, ϑ

In [32]:
for i in range(5):
    x = tf.random.normal((1,2))
    Resnet = PINN_ResNet()
    out = Resnet(x)
    δ,ϑ = get_hessian_ResNet(Resnet)
    ϑ_ad = _fvals2(Resnet, x)[2]
    print(np.linalg.norm(ϑ-ϑ_ad))

0.004866444
0.017996006
0.021355437
0.007731237
0.00909908


In [27]:
Resnet.ResNet[0].weights[0]

<tf.Variable 'pinn__res_net_10/dense_94/kernel:0' shape=(2, 16) dtype=float32, numpy=
array([[ 0.21207619,  0.38117278, -0.11182824,  0.27755016, -0.11570671,
         0.46281052,  0.17287827,  0.14012116,  0.47641778,  0.16146809,
        -0.3751303 , -0.16747999, -0.10676545,  0.32442945,  0.09818757,
        -0.03592718],
       [-0.5770765 ,  0.09112537,  0.43927562,  0.5082332 , -0.53277296,
         0.06791842, -0.42723218,  0.5460204 ,  0.3342405 ,  0.5088353 ,
        -0.27283287, -0.5655137 , -0.41135675,  0.34313715,  0.35957617,
         0.1782304 ]], dtype=float32)>