# Deep Learning

Deep learning originally referred to the application of **"deep"** neural networks (that is, networks with more than one hidden layer)



In neural network libraries, n-dimensional arrays are referred to as tensors.

In this chapter, we will say **Tensor** is just a ***list***

`Tensor = list`

Note:
- A tensor is either a float, or List of Tensors

`Tensor = Union[float, List[Tensor]]`

In [2]:
# Helper function to find a tensor's shape

Tensor = list

from typing import List
def shape(tensor:Tensor)-> List[int]:
    sizes: List[int] = []
    while isinstance(tensor, list):
        sizes.append(len(tensor))
        tensor = tensor[0]
    return sizes


assert shape([1,2,3]) == [3]
assert shape([[1,2],[3,5],[5,6]]) == [3,2]

In [3]:
def is_1d(tensor:Tensor)->bool:
    """
    If tensor[0] is a list, it's a higher-order tensor. Otherwise, tensor is 1-dimensional(that is, a vector)
    """
    return not isinstance(tensor[0],list)

assert is_1d([1,2,3])
assert not is_1d([[1,2],[3,4]])

In [4]:
# Recursive tensor_sum function

def tensor_sum(tensor:Tensor) -> float:
    """ Sums up all the values in the tensor """
    if is_1d(tensor):
        return sum(tensor)     # just a list of floats, use Python sum
    else:
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)
    

assert tensor_sum([1,2,3]) == 6
assert tensor_sum([[1,2],[3,4]]) == 10


In [6]:
# This function applies a function elementwise to a single tensor

from typing import Callable

def tensor_apply(f:Callable[[float],float],tensor:Tensor)-> Tensor:
    """ Applies f elementwise """
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]
    

assert tensor_apply(lambda x:x+1,[1,2,3]) == [2,3,4]
assert tensor_apply(lambda x : 2 * x, [[1,2],[3,4]]) == [[2,4],[6,8]]

In [7]:
# Function that creates a zero tensor with the same shape as given tensor

def zeros_like(tensor:Tensor) -> Tensor:
    return tensor_apply(lambda _ : 0.0,tensor)

assert zeros_like([1,2,3]) == [0,0,0]
assert zeros_like([[1,2],[3,4]]) == [[0,0],[0,0]]

In [None]:
# function to corresponding elements from two tensors

def tensor_combine(f:Callable[[float,float],float],
                   t1: Tensor,
                   t2:Tensor) -> Tensor:
    """ Applies f to corresponding elements of t1 and t2 """
    if is_1d(t1):
        return [f(x,y) for x,y in zip(t1,t2)]
    else:
        return [tensor_combine(f,t1_i,t2_i) for t1_i,t2_i in zip(t1,t2)]
    
import operator

assert tensor_combine(operator.add,[1,2,3],[4,5,6]) == [5,7,9]
assert tensor_combine(operator.mul,[1,2,3],[4,5,6]) == [4,10,18]

## The Layer Abstraction

Layer, something that knows how to apply some function to its inputs and that knows how to backpropagate gradients


A Layer is one step in a neural network that:
-  takes input → processes it (forward)
- learns from mistakes by sending gradients back (backward)

What each method means:

-  forward(input) → tells the layer how to compute its output from the input.

- backward(gradient) → tells the layer how to adjust itself during learning.

- params() → returns the layer’s learnable values (like weights).

- grads() → returns the gradients (how much each weight should change).

In [None]:
from typing import Iterable, Tuple 
 
class Layer: 
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """ 
    def forward(self, input): 
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """ 
        raise NotImplementedError 
 
    def backward(self, gradient): 
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """ 
        raise NotImplementedError 
 
    def params(self) -> Iterable[Tensor]: 
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """ 
        return () 
 
    def grads(self) -> Iterable[Tensor]: 
        """
        Returns the gradients, in the same order as params().
        """ 
        return ()

In [None]:
import math

def sigmoid(t:float)-> float:
    return 1/(1+math.exp(-t))

class Sigmoid(Layer):
    def forward(self, input:Tensor) -> Tensor:
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation
        """
        self.sigmoids = tensor_apply(sigmoid,input)
        return self.sigmoids
    
    def backward(self, gradient: Tensor) -> Tensor:
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad, self.sigmoids, gradient)

## The Linear Layer

This layer will have parameters, which we had like to initialize with random values

There are three different schemes for randomly generating our weight tensors:

1️⃣ Uniform Initialization
→ Weights are random numbers between 0 and 1.
 - Simple but not ideal for deep networks.

2️⃣ Normal Initialization
→ Weights are drawn from a distribution centered around 0.
 -  Helps keep learning stable.

3️⃣ Xavier Initialization ⭐
→ Weights are scaled based on the number of inputs and outputs.
 -  Prevents gradients from becoming too large or too small.
Each weight is randomly chosen from a normal distribution with:

 - Mean = 0

 - Variance = 2 / (number of inputs + number of outputs)


In [None]:
from typing import List

Vector = List[float]

def dot(v:Vector,w:Vector)-> Vector:
    return sum(v_i*w_i for v_i,w_i in zip(v,w))

class Linear(Layer):
    def __init__(self,
                 input_dim:int,
                 output_dim:int,
                 init:str= 'xavier') -> None:
        """
        A layer of output_dim neurons, each with input_dim weights (and a bias)
        """
        self.input_dim = input_dim
        self.output_dim = output_dim

        """ random_tensor is a helper function that generates tensors with randomly initialized values using methods like normal, uniform, or Xavier initialization. """

        # self.w[0] is the weights for the oth neuron
        self.w = random_tensor(output_dim, input_dim, init=init)        

        # self.b[0] is the bias term for the oth neuron
        self.b = random_tensor(output_dim, init=init)

        def forward(self, input: Tensor) -> Tensor:
            # save the input to use the backward pass
            self.input = input

            # return the vector of neuron outputs
            return[dot(input,self.w[0])+ self.b[0]
                   for o in range(self.output_dim)]
        
        def backward(self, gradient: Tensor) -> Tensor:
             # Each b[o] gets added to output[o], which means 
        # the gradient of b is the same as the output gradient. 
            self.b_grad = gradient

            self.w_grad = [[self.input[i] * gradient[o]
                        for i in range(self.input_dim)]
                        for o in range(self.output_dim)]
            
            
             # Each input[i] multiplies every w[o][i] and gets added to every 
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o] 
        # across all the outputs. 

            return [sum(self.w[o][i] * gradient[o] for o in  
                   range(self.output_dim)) 
                   for i in range(self.input_dim)]
        
        def params(self) -> Iterable[Tensor]: 
           return [self.w, self.b] 
 
        def grads(self) -> Iterable[Tensor]: 
            return [self.w_grad, self.b_grad]

## Neural Networks as a Sequence of Layers

In [None]:
from typing import List 
 
class Sequential(Layer): 
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """ 
    def __init__(self, layers: List[Layer]) -> None: 
        self.layers = layers 
 
    def forward(self, input): 
        """Just forward the input through the layers in order.""" 
        for layer in self.layers: 
            input = layer.forward(input) 
        return input 
 
    def backward(self, gradient): 
        """Just backpropagate the gradient through the layers in reverse.""" 
        for layer in reversed(self.layers): 
            gradient = layer.backward(gradient) 
        return gradient 
 
    def params(self) -> Iterable[Tensor]: 
        """Just return the params from each layer.""" 
        return (param for layer in self.layers for param in layer.params()) 
 
    def grads(self) -> Iterable[Tensor]: 
        """Just return the grads from each layer.""" 
        return (grad for layer in self.layers for grad in layer.grads())

## Loss and Optimization

In [None]:
class Loss: 
    def loss(self, predicted: Tensor, actual: Tensor) -> float: 
        """How good are our predictions? (Larger numbers are worse.)""" 
        raise NotImplementedError 
 
    def gradient(self, predicted: Tensor, actual: Tensor) -> Tensor: 
        """How does the loss change as the predictions change?""" 
        raise NotImplementedError
    
class SSE(Loss): 
    """Loss function that computes the sum of the squared errors.""" 
    def loss(self, predicted: Tensor, actual: Tensor) -> float: 
        # Compute the tensor of squared differences 
        squared_errors = tensor_combine( 
            lambda predicted, actual: (predicted - actual) ** 2, 
            predicted, 
            actual) 
 
        # And just add them up 
        return tensor_sum(squared_errors) 
 
    def gradient(self, predicted: Tensor, actual: Tensor) -> Tensor: 
        return tensor_combine( 
            lambda predicted, actual: 2 * (predicted - actual), 
            predicted, 
            actual)

Now last piece to figure out is gradient descent. Throughout we have done all our gradient descent manually by having a training loop that involves something like:

`theta = gradient_step(theta, grad, -learning_rate)`

But this will not work here for couple of reasons

- Neural nets will have many parameters and we have to update all of them.
- Use more clever variants of gradient descent and we don't want to have to rewrite them each time.
  
So we introduce `Optimizer abstraction` of which gradinet descent will be sepecific instance

In [None]:
class Optimizer:
    """
    An optimizer updates the weights of a layer(in place) using information know by either the layer or the optimizer (or by both)
    """
    def step(self, layer:Layer) -> None:
        raise NotImplementedError
    
    # No implement gradient descent using tensor_combine

class GradientDescent(Optimizer):
    def __init__(self, learning_rate: float = 0.1) -> None:
        self.lr = learning_rate

    def step(self, layer: Layer) -> None:
        for param, grad in zip(layer.params(),layer.grads()):
            # Update param using a gradient step
            param[:] = tensor_combine(lambda param, grad: param - grad * self.lr, param, grad)

Now,optimizer that uses momentum. The idea is that we don't want to overract to each new gradient and so we maintain a running average of the gradient we have seen, updating it with each new gradient and taking a step in the direction of the average

In [None]:
class Momentum(Optimizer): 
    def __init__(self, 
                 learning_rate: float, 
                 momentum: float = 0.9) -> None: 
        self.lr = learning_rate 
        self.mo = momentum 
        self.updates: List[Tensor] = []  # running average 
 
    def step(self, layer: Layer) -> None: 
        # If we have no previous updates, start with all zeros 
        if not self.updates: 
            self.updates = [zeros_like(grad) for grad in layer.grads()] 
 
        for update, param, grad in zip(self.updates, 
                                       layer.params(), 
                                       layer.grads()): 
            # Apply momentum 
            update[:] = tensor_combine( 
                lambda u, g: self.mo * u + (1 - self.mo) * g, 
                update, 
                grad) 
 
            # Then take a gradient step 
            param[:] = tensor_combine( 
                lambda p, u: p - self.lr * u, 
                param, 
                update)

## Other Activation Functions

The **Sigmoid** function has fallen out of favour for a couple of reasons.
- The **Sigmoid(0)** equal 1/2, which means that a neuron whose inputs sum to 0 has a positive output.
- Another is that its gradient its very close to 0 for very large and small inputs, which means that its gradients can get "saturated" and its weights can get stuck


One popular replacement is **tanh**("hyperbolic tangent"), which is a different sigmoid-shaped function that ranges from -1 to 1 and outputs 0 if its input is 0.


The derivative of **tanh(x)** is just `1 - tanh(x) ** 2`

In [None]:
import math

def tanh(x:float) -> float:
    # If x is very large or very small, tanh is (essentially) 1 or -1.
    # We check for this because, e.g., math.exp(1000) raises an error.
    if x < -100: return -1 
    elif x > 100: return 1

    em2x = math.exp(-2 * x)
    return (1-em2x) / (1+em2x)

class tanh(Layer):
    def forward(self, input:Tensor)-> Tensor:
        # save tanh output to use in backward pass
        self.tanh = tensor_apply(tanh, input)
        return self.tanh
    
    def backward(self, gradient:Tensor) -> Tensor:
        return tensor_combine(
            lambda tanh, grad: (1 - tanh ** 2) * grad,self.tanh, gradient)

In larger networks another popular replacement is **Relu**, which is **0** for negative inputs and identity for postive inputs

In [None]:
class Relu(Layer):
    def forward(self, input:Tensor)-> Tensor:
        self.input = input
        return tensor_apply(lambda x : max(x,0), input)
    
    def backward(self, gradient: Tensor) -> Tensor:
        return tensor_combine(lambda x, grad: grad if x > 0 else 0,
                              self.input,gradient)

## Softmaxes and Cross Entropy

Previously we used the Sigmoid layer, which means that its output was a vector of numbers between 0 and 1. Then particularly, it could output a vector that has entirely 0s, or it could output a vector that was entirely 1s.

Now for classification problems:
-  **1** is for correct class
-  **0** is for incorrect class


Now take a example, if we have two classes and our model outputs [0,0] then its hard to make much sense. This shows that output is not belonged to any class.

But if our model outputs [0.4,0.6], we can interpret it as a prediction that there's probability of **0.4** that our input belongs to first class and **0.6** that our input belongs to second class.

In order to accomplish this forget the `Sigmoid` layer and instead use the `Softmax` function which converts a vector of real numbers to a vector of probabilities

- For Numerical Stability

In practice, we compute:

$$[
\text{softmax}(z_i) =
\frac{e^{z_i - \max(z)}}
{\sum_{j} e^{z_j - \max(z)}}
]$$

This prevents overflow when numbers are large.


In [None]:
def softmax(tensor:Tensor) -> Tensor:
    """ Softmax along the last dimension """
    if is_1d(tensor):
        # Subtract largest value for numerical stability
        largest = max(tensor)
        exps = [math.exp(x-largest) for x in tensor]

        sum_of_exps = sum(exps)                           
        return [exp_i / sum_of_exps for exp_i in exps]
    
    else: 
        return [softmax(tensor_i) for tensor_i in tensor]

In [None]:
class SoftmaxCrossEntropy(Loss):
    """
    This is the negative log-likelihood of the observed values, given the neural net model. So if we choose weights to minimize it, our model will be maximizing the likelihood of the observed data
    """
    def loss(self, predicted:Tensor, actual:Tensor)-> float:
        # Apply softmax to get probabilities
        probabilities = softmax(predicted)

        # This will be log p_i for the actual class i and 0 for the other classes. We add a tiny amount to p to avoid taking log(0)

        likelihoods = tensor_combine(lambda p,act: math.log(p + 1e-30) * act, probabilities, actual)

        # and then we just sum up the negatives
        return -tensor_sum(likelihoods)
    
    def gradient(self, predicted:Tensor, actual:Tensor)-> Tensor:
        probabilities = softmax(predicted)

        return tensor_combine(lambda p,actual: p - actual, probabilities, actual)

## Dropout

A common way of regularization neural networks is using dropout.

- At training time, we randomly turn off each neuron (that is, replace its output with 0) with some fixed probability. This means that the network can't learn to depend on any individual neuron
- At evaluation time, we don't want to dropout any neurons, so a **Dropout** layer will need to know whether it's training or not.


In [None]:
import random

class Dropout(Layer):
    def __init__(self,p:float)-> None:
        self.p = p
        self.train = True

    def forward(self, input: Tensor)-> Tensor:
        if self.train:
            # Create a mask of 0s and 1s shaped like the input using the specified probability
            self.mask = tensor_apply(
                lambda _: 0 if random.random() < self.p else 1, input
            )
            # Multiply by the mask to dropout the inputs
            return tensor_combine(operator.mul, input, self.mask)
        else:
            # During evaluation just scale down the outputs uniformly
            return tensor_apply(lambda x:x * (1 - self.p),input)
        
    def backward(self, gradient:Tensor) -> Tensor:
        if self.train:
            # Only propagate the gradients where mask == 1.
            return tensor_combine(operator.mul, gradient, self.mask)
        else:
            raise RuntimeError("don't call backward when not in train model")

## Example: MNIST (Modified National Institute of Standards and Technology)


To install MNIST:

`python -m pip install mnist`

In [None]:
from tensorflow.keras.datasets import mnist

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

assert train_images.shape == (60000, 28, 28)
assert train_labels.shape == (60000,)



Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step


In [10]:
import tensorflow as tf 

train_images = train_images / 255.0
test_images = test_images / 255.0

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),  # 28x28 → 784
    tf.keras.layers.Dense(128, activation='relu'),  # Hidden layer
    tf.keras.layers.Dense(10, activation='softmax') # 10 output classes
])


model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(train_images, train_labels, epochs=5)


  super().__init__(**kwargs)


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.1116 - loss: 2.3014
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.1124 - loss: 2.3007
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.1124 - loss: 2.2990
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.1185 - loss: 2.2940
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.1521 - loss: 2.2824


<keras.src.callbacks.history.History at 0x25228b8f620>

In [11]:
test_loss, test_acc = model.evaluate(test_images, test_labels)
print("Test accuracy:", test_acc)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.1603 - loss: 2.2723
Test accuracy: 0.16030000150203705


## Saving and Loading Models

In [None]:
import json

def save_weights(model: Layer, filename: str) -> None:
    weights = list(model.params())
    with open(filename,'w') as f:
        json.dump(weights,f)


def load_weights(model: Layer, filename: str) -> None: 
    with open(filename) as f: 
        weights = json.load(f) 
 
    # Check for consistency 
    assert all(shape(param) == shape(weight) 
               for param, weight in zip(model.params(), weights)) 
 
    # Then load using slice assignment 
    for param, weight in zip(model.params(), weights): 
        param[:] = weight