# First and second derivative of FNN with respect to input (tensorflow)

Import necessary packages.

In [1]:
import tensorflow as tf
import numpy as np

Define activation function and its derivatives.

In [2]:
# Custom activation function
from keras.layers import Activation
from keras import backend as K
from keras.utils.generic_utils import get_custom_objects

#def mσ(x):
    #return np.abs(x) + np.log(1. + np.exp(-2. * np.abs(x)))
    
def mσ(x):
    return tf.math.divide(1, 1 + tf.math.exp(tf.math.negative(x)))

get_custom_objects().update({'custom_activation': Activation(mσ)})


In [3]:
#def mdσ(x):
    #return np.tanh(x)
    
    
#def md2σ(x):
    #return np.divide(1., np.square(np.cosh(x)))

def mdσ(x):
    return mσ(x) * (1 - mσ(x))
    
    
def md2σ(x):
    return mσ(x) * (1 - mσ(x)) * (1 - 2*mσ(x))

In [4]:
x = [[10.], [20.], [30.]]

print(mσ(x))
print(tf.keras.activations.sigmoid(x))
print(mdσ(x))
print(md2σ(x))

tf.Tensor(
[[0.9999546]
 [1.       ]
 [1.       ]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[0.9999546]
 [1.       ]
 [1.       ]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[4.5416677e-05]
 [0.0000000e+00]
 [0.0000000e+00]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[-4.541255e-05]
 [-0.000000e+00]
 [-0.000000e+00]], shape=(3, 1), dtype=float32)


Does not exactly match the results/values in Julia.

Define Neural Network.

In [5]:
# Define model architecture
class PINN(tf.keras.Model):
    """ Set basic architecture of the PINN model."""

    def __init__(self,
                 output_dim=1,
                 num_hidden_layers=3,
                 num_neurons_per_layer=20,
                 activationfunction = 'sigmoid',
                 kernel_initializer='glorot_normal',
                 **kwargs):
        
        super().__init__(**kwargs)

        self.num_hidden_layers = num_hidden_layers
        self.input_dim = 2
        self.output_dim = output_dim

        # Define NN architecture
        
        # Inititialize num_hidden_layers many fully connected dense layers
        self.hidden = [tf.keras.layers.Dense(num_neurons_per_layer,
                                             activation = activationfunction,
                                             kernel_initializer=kernel_initializer) for _ in range(self.num_hidden_layers)]
        
        # Output layer
        #self.out = tf.keras.layers.Dense(output_dim, activation=None)
        self.out = tf.keras.layers.Dense(output_dim, activation = activationfunction)
        
    def call(self, X):
        """Forward-pass through neural network."""
        self.tmp_layer_output = []
        #Z = self.scale(X)
        Z = X
        self.tmp_layer_output.append(Z)
        
        for i in range(self.num_hidden_layers):
            Z = self.hidden[i](Z)
            self.tmp_layer_output.append(Z)
            
        return self.out(Z)

Compute gradient.

Compute gradient for layer l.

In [6]:
def get_gradient_layer(W,b,a,δ):
    z1 = tf.transpose(a @ W)  
    b = tf.reshape(b, z1.shape)
    z2 = z1 + b
    z3 = mdσ(z2) * δ
    
    return W @ z3

Compute gradient of neural network.

In [7]:
def get_gradient(N):
    δ = get_gradient_layer(N.out.weights[0], N.out.weights[1], N.tmp_layer_output[-1], np.identity(N.output_dim))

    for k in range(N.num_hidden_layers-1, -1, -1):
        δ = get_gradient_layer(N.hidden[k].weights[0], N.hidden[k].weights[1], N.tmp_layer_output[k], δ)
            
    return δ

Compute gradient and Hessian of last layer.

In [8]:
def get_gradient_hessian_last_layer(W,b,a,δ):
    z1 = tf.transpose(a @ W)  
    b = tf.reshape(b, z1.shape)
    z2 = z1 + b
    z3 = mdσ(z2) * δ
    
    ϑ = tf.linalg.diag(tf.reshape(md2σ(z2), [-1]))
    
    return W @ z3, W @ ϑ @ tf.transpose(W)

Compute gradient and Hessian of hidden layer.

In [9]:
def get_gradient_hessian_hidden_layer(W,b,a,δ,ϑ):
    z1 = tf.transpose(a @ W)  
    b = np.reshape(b, np.shape(z1))
    z2 = z1 + b
    z3 = mdσ(z2) * δ
    
    t2 = δ * md2σ(z2)
    H1 = W @ tf.linalg.diag(tf.reshape(t2, [-1])) @ tf.transpose(W)

    dσt = tf.linalg.diag(tf.reshape(mdσ(z2), [-1]))
    H2 = W @ dσt @ ϑ @ dσt @ tf.transpose(W)
    
    return W @ z3, H1+H2

Compute Hessian and gradient of neural network.

In [10]:
def get_hessian(N):
    δ,ϑ = get_gradient_hessian_last_layer(N.out.weights[0], N.out.weights[1], N.tmp_layer_output[-1], np.identity(N.output_dim))

    for k in range(N.num_hidden_layers-1, -1, -1):
        δ,ϑ = get_gradient_hessian_hidden_layer(N.hidden[k].weights[0], N.hidden[k].weights[1], N.tmp_layer_output[k], δ,  ϑ)
            
    return δ,ϑ

Why do we get a 2D vector when we insert a 2D vector?

In [11]:
NeuralN = PINN()

x = tf.random.normal((1,2))

out = NeuralN(x)
#print(out)

δ1 = get_gradient(NeuralN)
δ2,ϑ = get_hessian(NeuralN)

print(δ1- δ2)
print(δ1)
print(tf.reshape(ϑ, [-1]))



tf.Tensor(
[[0.]
 [0.]], shape=(2, 1), dtype=float32)
tf.Tensor(
[[ 0.00029884]
 [-0.00043246]], shape=(2, 1), dtype=float32)
tf.Tensor([-1.0324914e-04 -8.4117437e-06 -8.4117382e-06 -4.8673053e-05], shape=(4,), dtype=float32)


In [12]:
NeuralN.hidden[0].weights[0]

<tf.Variable 'pinn/dense/kernel:0' shape=(2, 20) dtype=float32, numpy=
array([[-1.60525716e-03,  1.27788708e-01, -2.17058763e-01,
         8.61400515e-02, -5.37691295e-01, -1.04075164e-01,
         1.28059030e-01, -1.12600196e-02, -1.01063266e-01,
        -1.04028247e-01,  2.15378374e-01,  2.71746516e-01,
        -3.65494378e-03,  2.25552708e-01, -1.99095413e-01,
        -1.79860204e-01, -9.37863961e-02,  2.13393018e-01,
        -4.90239352e-01,  2.69888490e-01],
       [ 2.49091238e-02,  3.07551622e-02, -3.38211268e-01,
        -1.56116113e-01, -3.91041517e-01, -5.08562744e-01,
         5.18440545e-01,  7.20664933e-02,  1.89640656e-01,
         2.00130933e-04, -3.16599131e-01,  3.75685692e-02,
         1.38230994e-01, -1.06607094e-01,  5.82093537e-01,
         2.74899572e-01,  1.09360896e-01,  2.80288726e-01,
        -8.55155215e-02, -3.24645042e-01]], dtype=float32)>

-> We need to choose appropriate dtypes so that no operation overflows.

In [13]:
def _fvals1(N, x):

    with tf.GradientTape() as g:
        g.watch(x)
        y = N(x)

    dy_dx = g.gradient(y, x)
    dy_dx = np.transpose(dy_dx.numpy())

    return y, dy_dx

In [14]:
for i in range(10):
    x = tf.random.normal((1,2))
    NeuralN = PINN()
    out = NeuralN(x)
    δ1 = get_gradient(NeuralN)
    δ2,ϑ = get_hessian(NeuralN)
    δ_ad = _fvals1(NeuralN, x)[1]
    print(np.linalg.norm(δ1-δ2))
    print(np.linalg.norm(δ1-δ_ad))
    print(np.linalg.norm(δ2-δ_ad))

0.0
5.620883e-10
5.620883e-10
0.0
2.910383e-11
2.910383e-11
0.0
4.656613e-10
4.656613e-10
0.0
2.6031258e-10
2.6031258e-10
0.0
5.820766e-11
5.820766e-11
0.0
3.540634e-10
3.540634e-10
0.0
1.8635544e-10
1.8635544e-10
0.0
4.665699e-10
4.665699e-10
0.0
1.8635544e-10
1.8635544e-10
0.0
1.6463612e-10
1.6463612e-10


In [15]:
def _fvals2(N, x):

    with tf.GradientTape(persistent=True) as h:
        h.watch(x)
        with tf.GradientTape() as g:
            g.watch(x)
            y = N(x)

        dy_dx = g.gradient(y, x)
    
    d2y_d2x = h.jacobian(dy_dx, x)

    return y, dy_dx, d2y_d2x

In [16]:
for i in range(10):
    x = tf.random.normal((1,2))
    NeuralN = PINN()
    out = NeuralN(x)
    δ,ϑ = get_hessian(NeuralN)
    ϑ_ad = _fvals2(NeuralN, x)[2]
    print(np.linalg.norm(ϑ-ϑ_ad))

0.00023168208
0.00021419277
0.00013859835
0.0003060957
2.5743322e-05
0.0005692281
2.4410165e-05
0.0008893156
0.00012566645
0.00074173196


In [17]:
x = tf.random.normal((10,))
t = tf.random.normal((10,))
tx = tf.stack([t, x], axis=1)
print(NeuralN(tx))



tf.Tensor(
[[0.59380233]
 [0.59357953]
 [0.593705  ]
 [0.5941088 ]
 [0.5947718 ]
 [0.5949982 ]
 [0.59471434]
 [0.5954039 ]
 [0.5949071 ]
 [0.5955576 ]], shape=(10, 1), dtype=float32)


Maybe gradient tape thinks that the neural network is not differentiable?

# Explicit derivatives of ResNet

Here we only approximate the "half" gradient so far. 

In [18]:
class PINN_ResNet(tf.keras.Model):
    """ Set basic architecture of the PINN model."""

    def __init__(self,
                 ResNetLayers=2,
                 ResNetNeurons=16,
                 ResNetStepsize=1.0,
                 ResNetActivation='sigmoid',
                 **kwargs):
        
        super(PINN_ResNet, self).__init__(**kwargs)
        
        #RNact = tf.keras.activations.get(ResNetActivation)
        #RNact = my_act
        RNact = ResNetActivation
        

        
        self.ResNetLayers = ResNetLayers
        self.ResNetStepsize = ResNetStepsize

        self.ResNet = [tf.keras.layers.Dense(ResNetNeurons,
                                        activation = RNact) for _ in range(self.ResNetLayers)]
        self.wb = tf.keras.layers.Dense(1)
        self.A = tf.keras.layers.Dense(2, use_bias=False)
        self.c = tf.keras.layers.Dense(1, use_bias=False)
        
        #self.num_hidden_layers = num_hidden_layers
        self.input_dim = 2
        self.output_dim = 1


        # Define NN architecture
        
        # Output layer
        #self.out = tf.keras.layers.Dense(1, activation='sigmoid')

        
    def call(self, input_tensor, training=False):
        """Forward-pass through neural network."""
        
        self.tmp_layer_output = [input_tensor]
        
        N = self.ResNet[0](input_tensor, training=training)
        
        for i in range(1, self.ResNetLayers):
            self.tmp_layer_output.append(N)
            N = N + self.ResNetStepsize * self.ResNet[i](N, training=training)
        
        Phi = self.wb(N, training=training)

#         As = self.A(input_tensor, training=training)
#         sAs = tf.keras.layers.Dot(axes=(1))([input_tensor, As])
#         Phi += .5 * sAs
#         Phi += self.c(input_tensor, training=training)
            
        return Phi

Gradient of model, which approximates solution of pde

In [19]:
def get_gradient_ResNet(R):
    δ = get_gradient_layer(R.ResNet[-1].weights[0], R.ResNet[-1].weights[1], R.tmp_layer_output[-1], R.wb.weights[0])

    δ = R.wb.weights[0] + R.ResNetStepsize * δ
 
    for k in range(R.ResNetLayers-2, 0, -1):
        δ = δ + R.ResNetStepsize * get_gradient_layer(R.ResNet[k].weights[0], R.ResNet[k].weights[1], R.tmp_layer_output[k], δ)
          
    
    δ = get_gradient_layer(R.ResNet[0].weights[0], R.ResNet[0].weights[1], R.tmp_layer_output[0], δ)
    
    #return δ + np.transpose(R.A(R.tmp_layer_output[0]).numpy()) + R.c.get_weights()[0]
    return δ 

Something is wrong with the 'whole' gradient?

In [20]:
Resnet = PINN_ResNet()

x = tf.constant([[1., 10.]])

out = Resnet(x)

δ = get_gradient_ResNet(Resnet)

print(δ)

δ_ad = _fvals1(Resnet, x)

print(δ_ad[1])

print(np.linalg.norm(δ - δ_ad[1]))

tf.Tensor(
[[ 0.02221752]
 [-0.00531843]], shape=(2, 1), dtype=float32)
[[ 0.0222175 ]
 [-0.00531846]]
3.3967748e-08


In [21]:
for i in range(5):
    x = tf.random.normal((1,2))
    Resnet = PINN_ResNet()
    out = Resnet(x)
    δ1 = get_gradient_ResNet(Resnet)
    δ_ad = _fvals1(Resnet, x)[1]
    print(np.linalg.norm(δ1-δ_ad))

2.0061302e-08
0.0
7.450581e-09
1.4901161e-08
1.8626451e-08


In [22]:
def get_gradient_hessian_layer_ResNet(W,b,a,δ):
    z1 = np.transpose(a @ W)  
    b = np.reshape(b, np.shape(z1))
    z2 = z1 + b
    z3 = np.diag(tf.reshape(mdσ(z2), [-1])) @ δ
    
    z4 = md2σ(z2) * δ
    ϑ = np.diag(tf.reshape(z4, [-1]))
    
    return W @ z3, W @ ϑ @ np.transpose(W)

In [23]:
def get_hessian_ResNet(R):
    δ,ϑ = get_gradient_hessian_layer_ResNet(R.ResNet[-1].weights[0], R.ResNet[-1].weights[1], R.tmp_layer_output[-1], R.wb.weights[0])

    δ = R.wb.weights[0] + R.ResNetStepsize * δ
 
    for k in range(R.ResNetLayers-2, 0, -1):
        δ_new, ϑ_new_1 = get_gradient_hessian_layer_ResNet(R.ResNet[k].weights[0], R.ResNet[k].weights[1], R.tmp_layer_output[k], δ)
        z1 = np.transpose(R.tmp_layer_output[k] @ R.ResNet[k].weights[0])  
        b = np.reshape(R.ResNet[k].weights()[1], np.shape(z1))
        z2 = z1 + b
        t1 = ϑ + R.ResNetStepsize * R.ResNet[k].weights[0] @ np.diag(tf.reshape(mdσ(z2), [-1])) @ ϑ
        ϑ_new_2 = np.transpose(t1) + R.ResNetStepsize * R.ResNet[k].weights[0] @ np.diag(tf.reshape(mdσ(z2), [-1])) @ np.transpose(t1)
        ϑ = ϑ_new_1 + ϑ_new_2
        δ = δ + R.ResNetStepsize * δ_new
    
      
    δ, ϑ = get_gradient_hessian_hidden_layer(R.ResNet[0].weights[0], R.ResNet[0].weights[1], R.tmp_layer_output[0], δ, ϑ)
    
    #return δ + np.transpose(R.A(R.tmp_layer_output[0]).numpy()) + R.c.get_weights()[0], ϑ + np.transpose(R.A.get_weights()[0])
    return δ, ϑ

In [24]:
for i in range(5):
    x = tf.random.normal((1,2))
    Resnet = PINN_ResNet()
    out = Resnet(x)
    δ,ϑ = get_hessian_ResNet(Resnet)
    ϑ_ad = _fvals2(Resnet, x)[2]
    print(np.linalg.norm(ϑ-ϑ_ad))

0.02004727
0.040664587
0.0139674535
0.007035735
0.002148037


In [25]:
Resnet.ResNet[0].weights[0]

<tf.Variable 'pinn__res_net_10/dense_134/kernel:0' shape=(2, 16) dtype=float32, numpy=
array([[-0.10830575,  0.3440271 , -0.47204584,  0.10250157,  0.2765507 ,
         0.3026178 , -0.4115299 ,  0.47118652, -0.2142457 , -0.4787906 ,
        -0.11594582, -0.12754828, -0.12051362, -0.05279261,  0.36103922,
         0.36242455],
       [ 0.53960454,  0.08977658,  0.30206776, -0.01997924, -0.40110546,
         0.10174572, -0.47245246,  0.49537158, -0.09396005, -0.10590637,
         0.27241457, -0.05103976, -0.2864606 , -0.0780122 , -0.2055957 ,
        -0.30557978]], dtype=float32)>