In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU') 
for gpu_instance in physical_devices: 
    tf.config.experimental.set_memory_growth(gpu_instance, True)
from tensorflow.experimental import numpy as np
import numpy as np

XOR problem 
===


In [2]:
X_xor = np.array([[0,0], [0,1], [1,0], [1,1]])
y_xor = np.array([0, 1, 1, 0])

In [11]:
# For others activations functions, check : https://github.com/RomainGrx/deep-learning-from-scratch/blob/master/from_scratch/activations/activation_functions.py
# For others loss functions, check : https://github.com/RomainGrx/deep-learning-from-scratch/blob/master/from_scratch/losses.py
class ReLU:
    @staticmethod
    def forward(x):
        return np.clip(x, 0, x)
    @staticmethod
    def backward(x):
        return (x > 0).astype(np.uint8)

class Sigmoid:
    @staticmethod
    def forward(x):
        return 1 / (1 + np.exp(-x))
    @staticmethod
    def backward(x):
        return np.exp(-x) / (1-np.exp(-x))**2

class Crossentropy:
    @staticmethod
    def forward(y_hat, y):
        return - np.sum(y*np.log(y_hat)+(1-y)*np.log(1-y_hat))
    @staticmethod
    def backward(y_hat, y):
        return - y/y_hat + (1-y)/(1-y_hat)
    
class DenseLayer:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
    def forward(self, x):
        self.x = x
        return x, self.W + self.b
    
    def backward(self, x, hidden=False):
        grads_input = x @ self.W.T
        if self.x is not None and hidden == True:
            grads_W = self.x.T @ x
            grads_b = np.sum(x, axis=0, keepdims=True)
            return dict(grads_input=grads_input, grads_W=grads_W, grads_b=grads_b)
        return grads_input
            
class DenseModel:
    def __init__(self, Wxh, bh, Why, by):
        self.Wxh, self.bh, self.Why, self.by = Wxh, bh, Why, by
        self.net = (
            DenseLayer(self.Wxh, self.bh),
            ReLU,
            DenseLayer(self.Why, self.by),
            Sigmoid
        )
    
    def forward(self, x):
        # In case argument is a single example and not a batch
        if x.ndim == 1:
            x = x[None]
        y = x
        for layer in self.net:
            y = layer.forward(y)
        return y
    
    def backward(self, loss):
        raise NotImplementedError()
    

Question 1 : Forward propagation
---

In [12]:
Wxh = np.array([[.79, 1.34], [.87, 1.08]])
bh = np.array([.10, -1.12])
Why = np.array([.68, -2.01])
by = np.array([-.3])

clf = DenseModel(Wxh, bh, Why, by)
y_hat = clf.forward(X_xor)
print("Outputed y_hat :: ", ["%.3f"%v for v in y_hat])

  return array(a, dtype, copy=False, order=order)
  return ufunc(*args, out=out, **kwargs)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Question 2 : Classification
---

How many examples are correctly classified with the current network?

In [60]:
def xor_correct(y_hat):
    y_hat_rounded = np.round(y_hat)
    return np.sum(y_xor == y_hat_rounded)

n_corrects = xor_correct(y_hat)
print("Number of correctly classified :: %d"%n_corrects)

Number of correctly classified :: 2


Question 3 : Loss
---

What is the value of the cross-entropy loss $L(\hat{y}, y)$ computed from the 4 examples using this network? This cross-entropy loss is the cost function $J$ of the network, which depends on the network parameters.

In [61]:
loss = Crossentropy.forward(y_hat, y_xor)
print("Binary crossentropy loss :: %.3f"%loss)

[3.57030526]
Binary crossentropy loss :: 3.570


Question 4 : Back-propagation
---

In order to update the weights of our model, we will back-propagate one example, namely $x_3 = \begin{bmatrix} 1 \\ 0 \end{bmatrix}$, $y_3=1$.

First, we need to compute the output layer gradient $\nabla{\hat{y}}J$. What is its value when considering this example ?

In [56]:
x3, y3 = np.array([[1, 0]]), np.array([[1]])
print(x3.shape)
y_hat3 = clf.forward(x3)
print(y_hat3.shape)
gradient = Crossentropy.backward(y_hat3, y3)

print(gradient)

(1, 2)
(1, 1)
[[-2.14682815]]


Question 5 : Back-propagation (continued) 
---

Consequently, what are the values of $\nabla_{W_{h \rightarrow y}} J$ and $\nabla_{b_{y}} J$, i.e. the gradients of the weight vectors and the bias of the last layer?

Give your answer using the format *grad_w_1*, *grad_w_2*, *grad_b*.

In [57]:
grad_sig = clf.net[-1].backward(y_hat3)
print(grad_sig.shape)
grad_dict = clf.net[-2].backward(grad_sig, hidden=True)
print(grad_dict)

(1, 1)


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)