In [1]:
import numpy as np
from numpy import ndarray
from typing import Dict,Tuple

In [2]:
def tanh(W:ndarray)->ndarray:
    """Apply the hyperbolic tangent element-wise to ``W``.

    Thin wrapper around ``np.tanh``; the result has the same shape as ``W``.
    """
    activated = np.tanh(W)
    return activated

In [3]:
def tanh_der(S:ndarray)->ndarray:
    """Element-wise derivative of tanh evaluated at ``S``: ``1 - tanh(S)**2``."""
    activation = tanh(S)
    return 1 - np.square(activation)

In [4]:
def e(H:ndarray)->ndarray:
    """Element-wise natural exponential of ``H`` (wrapper around ``np.exp``)."""
    exponentiated = np.exp(H)
    return exponentiated

In [5]:
def sigmoid(n:ndarray)->ndarray:
    """Logistic sigmoid ``1 / (1 + exp(-n))``, applied element-wise."""
    denominator = 1 + e(-n)
    return 1/denominator

In [6]:
def sigmoid_der(h:ndarray)->ndarray:
    """Element-wise derivative of the sigmoid at ``h``: ``sigmoid(h) * (1 - sigmoid(h))``."""
    activation = sigmoid(h)
    return activation*(1-activation)

In [7]:
def Quadratic_Cost_regression_2d(y_pred:ndarray,y_obs:ndarray)->float:
    """Mean of the squared element-wise differences between ``y_pred`` and ``y_obs``.

    The mean is taken over every element of the difference array.
    """
    residual = y_pred - y_obs
    return np.mean(np.square(residual))

In [9]:
def Quadratic_Cost_naive_2d(y_pred:ndarray,y_obs:ndarray)->float:
    """Sum of squared errors divided by the number of columns of ``y_obs``.

    Written for representations created by pre-multiplying the inputs with a
    weight matrix, where the number of columns corresponds to the number of
    datapoints. The total squared error over all rows and columns is therefore
    averaged over datapoints (columns) only, not over every element.

    Args:
        y_pred: 2-D array of predictions, one column per datapoint.
        y_obs: 2-D array of observations with the same shape as ``y_pred``.

    Returns:
        ``sum((y_pred - y_obs)**2) / y_obs.shape[1]``.

    Raises:
        ValueError: if the inputs are not 2-D or their shapes differ. The
            element-wise loop this replaces took its row count from ``y_pred``
            and its column count from ``y_obs``, so a mismatch either failed
            with an IndexError or silently ignored part of one array.
        ZeroDivisionError: if there are no columns to average over.
    """
    if y_pred.ndim != 2 or y_pred.shape != y_obs.shape:
        raise ValueError(
            "y_pred and y_obs must be 2-D arrays of identical shape, got "
            f"{y_pred.shape} and {y_obs.shape}"
        )

    n_datapoints = y_obs.shape[1]
    if n_datapoints == 0:
        # Keep the exception the loop version raised (0/0) instead of returning NaN.
        raise ZeroDivisionError("y_obs has no columns (datapoints) to average over")

    squared_error = (y_pred-y_obs)**2
    return np.sum(squared_error)/n_datapoints
                                

In [20]:
def L(S:ndarray,w1:ndarray,w2:ndarray,w3:ndarray,b1:ndarray,b2:ndarray,b3:ndarray,y:ndarray)->Tuple[Dict[str,ndarray],float]:
    """Forward pass of a three-layer network followed by the quadratic cost.

    The computation is ``tanh(w1.S + b1)`` -> ``sigmoid(w2.(...) + b2)`` ->
    ``w3.(...) + b3``, and the final output is scored against ``y`` with
    ``Quadratic_Cost_naive_2d``.

    Returns:
        A pair ``(Data, cost)``. ``Data`` maps the names of all inputs
        (``'S'``, ``'w1'``..``'w3'``, ``'b1'``..``'b3'``, ``'y'``) and all
        intermediate results (``'g1'``..``'g8'``) to their arrays, in that
        order; ``cost`` is the scalar objective value.
    """
    # Layer 1: affine map, then tanh.
    hidden1_linear = np.dot(w1,S)
    hidden1_affine = hidden1_linear + b1
    hidden1_out = tanh(hidden1_affine)

    # Layer 2: affine map, then sigmoid.
    hidden2_linear = np.dot(w2,hidden1_out)
    hidden2_affine = hidden2_linear + b2
    hidden2_out = sigmoid(hidden2_affine)

    # Output layer: affine map only, scored by the quadratic cost.
    output_linear = np.dot(w3,hidden2_out)
    prediction = output_linear + b3
    cost = Quadratic_Cost_naive_2d(prediction,y)

    # Inputs first, then intermediates, so the key order matches g1..g8.
    Data:Dict[str,ndarray] = dict(S=S,w1=w1,w2=w2,w3=w3,b1=b1,b2=b2,b3=b3,y=y)
    Data.update(g1=hidden1_linear,g2=hidden1_affine,g3=hidden1_out,
                g4=hidden2_linear,g5=hidden2_affine,g6=hidden2_out,
                g7=output_linear,g8=prediction)

    return Data,cost
        
                             

In [18]:
# Seed the global NumPy RNG so the random network below, and therefore every
# number produced by the gradient checks that use it, is identical on each run.
np.random.seed(0)

S = np.random.randn(2,2)    # inputs; multiplied from the left by w1
w1 = np.random.randn(3,2)   # layer-1 weights
b1 = np.random.randn(3,1)   # layer-1 bias (one column, broadcast across S's columns)
w2 = np.random.randn(4,3)   # layer-2 weights
b2 = np.random.randn(4,1)   # layer-2 bias
w3 = np.random.randn(1,4)   # output-layer weights
b3 = np.random.randn(1,1)   # output-layer bias
y = np.random.randn(1,2)    # targets, same shape as the network output

In [21]:
# Run the forward pass once; displays the pair (dict of inputs and intermediates, cost).
L(S,w1,w2,w3,b1,b2,b3,y)

({'S': array([[-0.68424688,  0.1226127 ],
         [ 0.85043664,  0.46881203]]), 'w1': array([[ 1.14205375, -0.17839548],
         [ 0.9174844 , -0.14132818],
         [-1.10637701,  0.0749737 ]]), 'w2': array([[ 1.35164888, -0.52904082, -1.10840109],
         [-2.02972089,  0.94565733,  1.16814785],
         [ 0.89546794,  2.15418749,  2.41812263],
         [ 0.45734434, -0.4099463 ,  0.63345622]]), 'w3': array([[-1.1047429 , -0.39768368, -0.17999112, -1.16220417]]), 'b1': array([[ 0.78941267],
         [ 0.76236111],
         [-0.05389374]]), 'b2': array([[ 0.11596932],
         [-0.62191183],
         [-0.18579458],
         [ 0.13295161]]), 'b3': array([[-0.50117255]]), 'y': array([[-0.56611036,  0.36550861]]), 'g1': array([[-0.93316077,  0.05639635],
         [-0.7479765 ,  0.04623889],
         [ 0.8207954 , -0.10050731]]), 'g2': array([[-0.1437481 ,  0.84580902],
         [ 0.01438461,  0.80860001],
         [ 0.76690166, -0.15440105]]), 'g3': array([[-0.1427661 ,  0.68887366],


In [57]:
def grads(data:Dict[str,ndarray])->Dict[str,ndarray]:
    """Back-propagate the quadratic cost through the network evaluated by ``L``.

    Args:
        data: the dictionary returned as the first element of ``L(...)``.
            The keys read here are ``'S'``, ``'y'``, ``'g2'``, ``'g3'``,
            ``'g5'``, ``'g6'``, ``'g8'``, ``'w2'``, ``'w3'``, ``'b1'``,
            ``'b2'`` and ``'b3'``. ``'w1'`` is not needed for any gradient
            and is no longer looked up.

    Returns:
        A dictionary with the gradients of the cost with respect to the
        weights (``'w1'``, ``'w2'``, ``'w3'``), the biases (``'b1'``,
        ``'b2'``, ``'b3'``) and the pre-activation / output arrays
        (``'grad_g8'``, ``'grad_g5'``, ``'grad_g2'``). Weight and bias
        gradients have the shape of the corresponding parameter.
    """
    S = data['S']
    y = data['y']
    g2 = data['g2']
    g3 = data['g3']
    g5 = data['g5']
    g6 = data['g6']
    g8 = data['g8']
    w2 = data['w2']
    w3 = data['w3']
    b1 = data['b1']
    b2 = data['b2']
    b3 = data['b3']

    def bias_grad(upstream:ndarray,bias:ndarray)->ndarray:
        # A bias is added to every column, so its gradient is the upstream
        # gradient summed across columns, reshaped to the bias's own shape.
        return upstream.sum(axis=1).reshape(bias.shape)

    # Derivative of sum((g8 - y)**2) / n_columns with respect to g8.
    grad_g8 = 2*(g8-y)/(y.shape[1])
    grad_w3 = np.dot(grad_g8,g6.transpose())
    grad_b3 = bias_grad(grad_g8,b3)

    # Through w3 and then the sigmoid non-linearity.
    grad_g5 = (np.dot(w3.transpose(),grad_g8))*sigmoid_der(g5)
    grad_w2 = np.dot(grad_g5,g3.transpose())
    grad_b2 = bias_grad(grad_g5,b2)

    # Through w2 and then the tanh non-linearity.
    grad_g2 = (np.dot(w2.transpose(),grad_g5))*tanh_der(g2)
    grad_w1 = np.dot(grad_g2,S.transpose())
    grad_b1 = bias_grad(grad_g2,b1)

    grad: Dict[str,ndarray] = {'w3':grad_w3,'w2':grad_w2,'w1':grad_w1,'b1':grad_b1,'b2':grad_b2,'b3':grad_b3,
                              'grad_g8':grad_g8,'grad_g5':grad_g5,'grad_g2':grad_g2}

    return grad

In [58]:
# Analytic gradients computed from the forward-pass dictionary (element [0] of L's result).
grads(L(S,w1,w2,w3,b1,b2,b3,y)[0])

{'w3': array([[-2.15858721, -1.2601697 , -3.08948689, -2.12509017]]),
 'w2': array([[0.34472206, 0.38440933, 0.12269544],
        [0.07777838, 0.09547129, 0.06271722],
        [0.03948163, 0.04478774, 0.01710213],
        [0.4382357 , 0.48334361, 0.13454856]]),
 'w1': array([[-0.20225301,  0.53513869],
        [ 0.05553329, -0.17842761],
        [-0.03589336,  0.1334987 ]]),
 'b1': array([[ 0.83423313],
        [-0.28883895],
        [ 0.22118634]]),
 'b2': array([[0.89277703],
        [0.27038755],
        [0.10825356],
        [1.09278301]]),
 'b3': array([[-3.83805491]]),
 'grad_g8': array([[-1.37513586, -2.46291905]]),
 'grad_g5': array([[0.32500674, 0.5677703 ],
        [0.13044648, 0.13994107],
        [0.04219543, 0.06605813],
        [0.37823316, 0.71454985]]),
 'grad_g2': array([[ 0.3774394 ,  0.45679373],
        [-0.11271925, -0.17611969],
        [ 0.07809738,  0.14308896]])}

In [60]:
grad = grads(L(S,w1,w2,w3,b1,b2,b3,y)[0]) # Store the analytic gradients of the objective function
                                                      # with respect to each weight, bias and intermediate array.

In [29]:
# Independent copy of w1, so one entry can be perturbed while w1 itself stays unchanged.
w1_11 = w1.copy()

In [30]:
 # Step entry [0,0] by 1e-5 relative to the unmodified w1 (not to w1_11 itself), so
 # re-running this cell always leaves exactly one step instead of accumulating steps.
 w1_11[0,0] = w1[0,0] + .00001

In [31]:
(L(S,w1_11,w2,w3,b1,b2,b3,y)[1] - L(S,w1,w2,w3,b1,b2,b3,y)[1])/(.00001) # forward-difference approximation of the (1,1)-th component
                              # of the gradient of the objective function with respect to w1.

-0.2022524668632286

In [44]:
grad['w1'][0,0]   # the partial derivative computed by grads; compare with the finite-difference value above.

-0.20225301327891795

In [37]:
# Independent copy of w1 used to perturb entry [0,1] only.
w1_12 = w1.copy()

In [38]:
# Step entry [0,1] by 1e-5 relative to the unmodified w1, so re-running this cell
# cannot accumulate more than one step.
w1_12[0,1] = w1[0,1] + .00001

In [39]:
(L(S,w1_12,w2,w3,b1,b2,b3,y)[1]-L(S,w1,w2,w3,b1,b2,b3,y)[1])/(.00001) # forward-difference approximation of the partial derivative
                                         # of the cost function with respect to the (1,2)-th component of w1.

0.5351389250396466

In [45]:
grad['w1'][0,1]  # the corresponding value computed by the grads function.

0.5351386886662324

In [40]:
# Independent copy of b1 used to perturb entry [0,0] only.
b1_11 = b1.copy()

In [41]:
# Step entry [0,0] by 1e-5 relative to the unmodified b1, so re-running this cell
# cannot accumulate more than one step.
b1_11[0,0] = b1[0,0] + .00001

In [42]:
(L(S,w1,w2,w3,b1_11,b2,b3,y)[1]-L(S,w1,w2,w3,b1,b2,b3,y)[1])/(.00001) # forward-difference approximation of the first component of the
                                  # gradient of the objective function with respect to b1.

0.8342313023490532

In [46]:
# Value computed by grads for the same component of b1; compare with the estimate above.
grad['b1'][0,0]

0.8342331267437736

In [47]:
# Independent copy of w2 used to perturb entry [0,1] only.
w2_12 = w2.copy()

In [48]:
# Step entry [0,1] by 1e-5 relative to the unmodified w2, so re-running this cell
# cannot accumulate more than one step.
w2_12[0,1] = w2[0,1] + .00001

In [49]:
# Forward-difference approximation of the partial derivative of the cost
# with respect to the (1,2)-th component of w2.
(L(S,w1,w2_12,w3,b1,b2,b3,y)[1] - L(S,w1,w2,w3,b1,b2,b3,y)[1])/(.00001)

0.38440893344038324

In [50]:
# Value computed by grads for the same component of w2; compare with the estimate above.
grad['w2'][0,1]

0.3844093307700902

In [52]:
# Independent copy of b2 used to perturb entry [1,0] only.
b2_21 = b2.copy()

In [53]:
# Step entry [1,0] by 1e-5 relative to the unmodified b2, so re-running this cell
# cannot accumulate more than one step.
b2_21[1,0] = b2[1,0] + .00001

In [55]:
# Forward-difference approximation of the partial derivative of the cost
# with respect to the second component of b2.
(L(S,w1,w2,w3,b1,b2_21,b3,y)[1]-L(S,w1,w2,w3,b1,b2,b3,y)[1])/(.00001)

0.2703879304899459

In [61]:
# Value computed by grads for the same component of b2; compare with the estimate above.
grad['b2'][1,0]

0.270387551073182

In [62]:
# Independent copy of w3 used to perturb entry [0,0] only.
w3_11 = w3.copy()

In [63]:
# Step entry [0,0] by -1e-5 (a backward step) relative to the unmodified w3, so
# re-running this cell cannot accumulate more than one step.
w3_11[0,0] = w3[0,0] - .00001

In [64]:
# Backward-difference approximation of the partial derivative of the cost with respect
# to the (1,1)-th component of w3: the step is -1e-5, hence the negative divisor.
(L(S,w1,w2,w3_11,b1,b2,b3,y)[1] - L(S,w1,w2,w3,b1,b2,b3,y)[1])/(-.00001)

-2.158590163592322

In [65]:
# Value computed by grads for the same component of w3; compare with the estimate above.
grad['w3'][0,0]

-2.158587209689129

In [66]:
# Independent copy of b3 used to perturb its single entry [0,0].
b3_11 = b3.copy()

In [68]:
# Step entry [0,0] by 1e-5 relative to the unmodified b3, so re-running this cell
# cannot accumulate more than one step.
b3_11[0,0] = b3[0,0] + .00001

In [70]:
# Forward-difference approximation of the partial derivative of the cost
# with respect to the single component of b3.
(L(S,w1,w2,w3,b1,b2,b3_11,y)[1] - L(S,w1,w2,w3,b1,b2,b3,y)[1])/(.00001)

-3.8380449126407252

In [73]:
# Gradient computed by grads for b3 (a 1x1 array); compare with the estimate above.
grad['b3']

array([[-3.83805491]])