# **Task 1**: Implementing and analyzing Custom Loss Functions in PyTorch

This task sequence introduces the development of custom loss functions in PyTorch, with a focus on applying theoretical knowledge to practical implementation. As an initial example, the L1 Loss (Mean Absolute Error) function is fully implemented, demonstrating how to extend PyTorch's nn.Module to create custom loss computations. Following this example, **implement** additional loss functions, including L2 Loss (Mean Squared Error), Binary Cross-Entropy Loss, and Cross-Entropy Loss for multi-class classification.

In [1]:
import torch
import torch.nn as nn

class L1Loss(nn.Module):
    """
    Mean Absolute Error (MAE) criterion, i.e. the L1 loss.
    """
    def forward(self, y_pred, y_true):
        """
        Average the element-wise absolute error between prediction and target.

        :param y_pred: Predicted values (Tensor).
        :param y_true: Ground truth values (Tensor).
        :return: Scalar tensor holding mean(|y_pred - y_true|).
        """
        residual = y_pred - y_true
        return residual.abs().mean()


# Example usage: evaluate L1Loss on a small hand-made sample and show autograd gradients.
if __name__ == "__main__":
    # Sample predictions (tracked by autograd) and ground-truth values.
    y_pred = torch.tensor([1.0, 2.0, 3.0, 4.0], requires_grad=True)
    y_true = torch.tensor([1.5, 2.5, 3.0, 4.5])

    # Initialize the custom L1Loss
    criterion = L1Loss()

    # Compute the loss using the L1Loss class and print it.
    loss = criterion(y_pred, y_true)
    print(f"Loss: {loss}")

    # Perform a backward pass to compute gradients (optional demonstration of PyTorch's autograd).
    loss.backward()
    print(f"Gradients on y_pred: {y_pred.grad}")

Loss: 0.375
Gradients on y_pred: tensor([-0.2500, -0.2500,  0.0000, -0.2500])


In [2]:
class L2Loss(nn.Module):
    """
    Mean Squared Error (MSE) criterion, i.e. the L2 loss.
    """
    def forward(self, y_pred, y_true):
        """
        Average the element-wise squared error between prediction and target.

        :param y_pred: Predicted values (Tensor).
        :param y_true: Ground truth values (Tensor).
        :return: Scalar tensor holding mean((y_pred - y_true)^2).
        """
        residual = y_pred - y_true
        return residual.pow(2).mean()

# Example usage: evaluate L2Loss on a small hand-made sample and show autograd gradients.
if __name__ == "__main__":
    # Sample predictions (tracked by autograd) and ground-truth values.
    # Both y_pred and y_true are PyTorch tensors.
    y_pred = torch.tensor([1.0, 2.0, 3.0, 4.0], requires_grad=True)
    y_true = torch.tensor([1.5, 2.5, 3.0, 4.5])
    # Initialize the custom L2Loss
    criterion = L2Loss()
    # Compute the loss using the L2Loss class and print it.
    loss = criterion(y_pred, y_true)
    print(f"Loss: {loss}")
    # Perform a backward pass to compute gradients (optional demonstration of PyTorch's autograd).
    loss.backward()
    print(f"Gradients on y_pred: {y_pred.grad}")

Loss: 0.1875
Gradients on y_pred: tensor([-0.2500, -0.2500,  0.0000, -0.2500])


In [3]:
class BCELoss(nn.Module):
    """
    Hand-written Binary Cross-Entropy (BCE) criterion.
    PyTorch already ships nn.BCELoss; this version exists for learning purposes.
    """
    def forward(self, y_pred, y_true):
        """
        Compute the mean binary cross-entropy between probabilities and labels.

        :param y_pred: Predicted probabilities (Tensor) with values in range [0, 1].
        :param y_true: Ground truth values (Tensor) with binary values 0 or 1.
        :return: Scalar tensor representing the BCE loss.
        """
        eps = 1e-7
        # Keep probabilities strictly inside (0, 1) so neither log() below sees 0.
        probs = y_pred.clamp(min=eps, max=1 - eps)
        positive_term = y_true * probs.log()
        negative_term = (1 - y_true) * (1 - probs).log()
        return -(positive_term + negative_term).mean()

# Example usage: evaluate BCELoss on a small hand-made sample and show autograd gradients.
if __name__ == "__main__":
    # Sample predicted probabilities (tracked by autograd) and binary ground-truth labels.
    # Both y_pred and y_true are PyTorch tensors.
    y_pred = torch.tensor([0.9, 0.2, 0.1, 0.8, 0.7], requires_grad=True)
    y_true = torch.tensor([1.0, 0.0, 0.0, 1.0, 1.0])
    # Initialize the custom BCELoss
    criterion = BCELoss()

    # Compute the loss using the BCELoss class and print it as a Python float.
    loss = criterion(y_pred, y_true)
    print(f"Loss: {loss.item()}")
    # Perform a backward pass to compute gradients (optional demonstration of PyTorch's autograd).
    loss.backward()
    print(f"Gradients on y_pred: {y_pred.grad}")

Loss: 0.20273664593696594
Gradients on y_pred: tensor([-0.2222,  0.2500,  0.2222, -0.2500, -0.2857])


In [4]:
class CELoss(nn.Module):
    """
    Cross-Entropy loss for multi-class classification, built from basic tensor ops.
    """
    def __init__(self):
        super(CELoss, self).__init__()

    def forward(self, logits, targets):
        """
        Forward pass for Cross-Entropy loss.

        The log-probabilities are computed directly as
        ``logits - logsumexp(logits)`` instead of ``log(softmax + eps)``.
        Adding an epsilon inside the log biases every value and caps the
        per-sample loss near ``-log(eps)``, so confidently wrong predictions
        are under-penalised and their gradient vanishes.

        :param logits: Logits from the model (Tensor). Shape: [batch_size, num_classes].
        :param targets: Ground truth class indices (integer Tensor). Shape: [batch_size].
        :return: Scalar tensor representing the CE loss averaged over the batch.
        """
        # Softmax is shift-invariant; subtracting the row maximum keeps exp() from overflowing.
        shifted = logits - torch.max(logits, dim=1, keepdim=True).values

        # The largest shifted entry of each row is 0, so the sum below is >= 1
        # and the log is always finite without any epsilon.
        log_normalizer = torch.log(torch.sum(torch.exp(shifted), dim=1, keepdim=True))
        log_probs = shifted - log_normalizer

        # Pick the log-probability of the correct class for every sample.
        target_log_probs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)
        return -target_log_probs.mean()

# Example usage: evaluate CELoss on a two-sample, three-class batch and show autograd gradients.
if __name__ == "__main__":
    # Sample logits (tracked by autograd), one row per sample, and the class index of each sample.
    # Both logits and targets are PyTorch tensors.
    logits = torch.tensor([[1.0, 2.0, 0.5], [2.0, 1.0, 0.5]], requires_grad=True)
    targets = torch.tensor([1, 0])
    # Initialize the custom CELoss
    criterion = CELoss()

    # Compute the loss using the CELoss class and print it as a Python float.
    loss = criterion(logits, targets)
    print(f"Loss: {loss.item()}")
    # Perform a backward pass to compute gradients (optional demonstration of PyTorch's autograd).
    loss.backward()
    print(f"Gradients on logits: {logits.grad}")

Loss: 0.46436864137649536
Gradients on logits: tensor([[ 0.1156, -0.1857,  0.0701],
        [-0.1857,  0.1156,  0.0701]])


# **Task 2:** Implementing Custom Activation Functions in PyTorch


This task involves developing a set of custom activation functions in PyTorch, understanding their roles in neural networks, and how they can be implemented from scratch. Activation functions are crucial for introducing non-linearity into the network, allowing for the learning of complex patterns in the data. You'll start with an example of the ReLU (Rectified Linear Unit) activation function and then proceed to **implement** additional activation functions such as Sigmoid, Tanh, and Softmax, followed by a **comparison** with PyTorch's built-in implementations.

**The backward calculation for the Softmax function is not straightforward; hence, you may rely solely on PyTorch's built-in functionality for the backward pass.**

In [5]:
class ReLU(nn.Module):
    """
    Implement the ReLU activation function with a hand-written backward pass.
    """
    def __init__(self):
        super(ReLU, self).__init__()
        # Input of the most recent forward pass; backward() needs it to build the mask.
        self.input = None

    def forward(self, x):
        """
        Forward pass for ReLU.
        :param x: Input tensor.
        :return: Output tensor where ReLU(x) = max(0, x).
        """
        self.input = x
        return torch.maximum(torch.zeros_like(x), x)

    def backward(self, grad_output):
        """
        Backward pass for custom ReLU.

        The local derivative is 1 where the forward input was > 0 and 0
        elsewhere, so the upstream gradient is passed through only at those
        positions. The mask is built from the cached input, not from the
        values of ``grad_output``.

        :param grad_output: Gradient tensor of the output.
        :return: Gradient tensor for the input.
        :raises RuntimeError: If called before any forward pass.
        """
        if self.input is None:
            raise RuntimeError("ReLU.backward() called before forward()")
        mask = (self.input > 0).to(grad_output.dtype)
        return grad_output * mask


# Example usage
if __name__ == "__main__":
    # Define a sample input tensor.
    x = torch.tensor([-1.0, 0.0, 1.0, 2.0], requires_grad=True)

    # Initialize the custom ReLU activation function.
    custom_relu = ReLU()

    # Compute the activation using the custom ReLU class.
    activated_x_custom = custom_relu(x)

    # Run the hand-written backward pass with an upstream gradient of ones,
    # the same seed gradient used for the built-in comparison below.
    gradients_custom = custom_relu.backward(torch.ones_like(activated_x_custom))

    # Print the outputs and gradients from the custom implementation.
    print("Custom ReLU output:", activated_x_custom)
    print("Custom ReLU gradients:", gradients_custom)

    # Reset gradients to zero before another backward pass
    x.grad = None

    # Compute the activation using PyTorch's built-in relu function.
    activated_x_torch = torch.relu(x)

    # Perform a backward pass to compute gradients for PyTorch's implementation.
    activated_x_torch.backward(torch.ones_like(x))
    gradients_torch = x.grad

    # Print the outputs and gradients from PyTorch's implementation.
    print("PyTorch ReLU output:", activated_x_torch)
    print("PyTorch ReLU gradients:", gradients_torch)

Custom ReLU output: tensor([0., 0., 1., 2.], grad_fn=<MaximumBackward0>)
Custom ReLU gradients: tensor([0., 0., 1., 1.], grad_fn=<IndexPutBackward0>)
PyTorch ReLU output: tensor([0., 0., 1., 2.], grad_fn=<ReluBackward0>)
PyTorch ReLU gradients: tensor([0., 0., 1., 1.])


In [6]:
class Sigmoid(nn.Module):
    """
    Sigmoid activation with a hand-written backward pass.
    """
    def __init__(self):
        super().__init__()

    def forward(self, x):
        """
        Apply Sigmoid(x) = 1 / (1 + exp(-x)) element-wise.

        :param x: Input tensor.
        :return: Output tensor of the same shape as x.
        """
        denominator = 1 + (-x).exp()
        # Despite its name, ``self.input`` caches the activation *output*;
        # backward() expresses the derivative in terms of that output.
        self.input = 1 / denominator
        return self.input

    def backward(self, grad_output):
        """
        Propagate a gradient through the most recent forward pass.

        Uses d(sigmoid)/dx = y * (1 - y), where y is the cached output.

        :param grad_output: Gradient tensor of the output.
        :return: Gradient tensor for the input.
        """
        cached_output = self.input
        scaled = grad_output * cached_output
        return scaled * (1 - cached_output)

# Example usage: compare the custom Sigmoid with torch.sigmoid on the same input.
if __name__ == "__main__":
    # Test the Sigmoid implementation.
    # Define a sample input tensor.
    x = torch.tensor([-1.0, 0.0, 1.0, 2.0], requires_grad=True)

    # Initialize the custom Sigmoid activation function.
    custom_sigmoid = Sigmoid()

    # Compute the activation using the custom Sigmoid class.
    activated_x_custom = custom_sigmoid(x)

    # Compute gradients of the custom forward pass. Note that this goes through
    # autograd; Sigmoid.backward() itself is not called in this demo.
    activated_x_custom.sum().backward()  
    gradients_custom = x.grad

    # Print the outputs and gradients from the custom implementation.
    print("Custom Sigmoid output:", activated_x_custom)
    print("Custom Sigmmoid gradients:", gradients_custom)

    # Reset gradients to zero before another backward pass
    x.grad = None

    # Compute the activation using PyTorch's built-in sigmoid function.
    activated_x_torch = torch.sigmoid(x)

    # Perform a backward pass to compute gradients for PyTorch's implementation.
    activated_x_torch.backward(torch.ones_like(x))
    gradients_torch = x.grad

    # Print the outputs and gradients from PyTorch's implementation.
    print("PyTorch Sigmoid output:", activated_x_torch)
    print("PyTorch Sigmoid gradients:", gradients_torch)

Custom Sigmoid output: tensor([0.2689, 0.5000, 0.7311, 0.8808], grad_fn=<MulBackward0>)
Custom Sigmmoid gradients: tensor([0.1966, 0.2500, 0.1966, 0.1050])
PyTorch Sigmoid output: tensor([0.2689, 0.5000, 0.7311, 0.8808], grad_fn=<SigmoidBackward0>)
PyTorch Sigmoid gradients: tensor([0.1966, 0.2500, 0.1966, 0.1050])


In [7]:
class Tanh(nn.Module):
    """
    Implement the Tanh activation function with a hand-written backward pass.
    """
    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, x):
        """
        Forward pass for Tanh.

        NOTE: this follows the textbook formula literally, so exp() overflows
        for large |x| (roughly above 88 in float32) and the result is NaN there.

        :param x: Input tensor.
        :return: Output tensor where Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)).
        """
        # Evaluate each exponential once and reuse it in numerator and denominator.
        exp_pos = torch.exp(x)
        exp_neg = torch.exp(-x)
        output = (exp_pos - exp_neg) / (exp_pos + exp_neg)
        # Despite its name, ``self.input`` caches the activation *output*;
        # backward() expresses the derivative in terms of that output.
        self.input = output
        return output

    def backward(self, grad_output):
        """
        Backward pass for custom Tanh.

        Uses d(tanh)/dx = 1 - y^2, where y is the output cached by forward().

        :param grad_output: Gradient tensor of the output.
        :return: Gradient tensor for the input.
        """
        grad_input = grad_output * (1 - self.input.pow(2))
        return grad_input

# Example usage: compare the custom Tanh with torch.tanh on the same input.
if __name__ == "__main__":
    # Test the Tanh implementation.
    # Define a sample input tensor.
    x = torch.tensor([-1.0, 0.0, 1.0, 2.0], requires_grad=True)

    # Initialize the custom Tanh activation function.
    custom_tanh = Tanh()

    # Compute the activation using the custom Tanh class.
    activated_x_custom = custom_tanh(x)

    # Compute gradients of the custom forward pass. Note that this goes through
    # autograd; Tanh.backward() itself is not called in this demo.
    activated_x_custom.sum().backward()  
    gradients_custom = x.grad

    # Print the outputs and gradients from the custom implementation.
    print("Custom Tanh output:", activated_x_custom)
    print("Custom Tanh gradients:", gradients_custom)

    # Reset gradients to zero before another backward pass
    x.grad = None

    # Compute the activation using PyTorch's built-in tanh function.
    activated_x_torch = torch.tanh(x)

    # Perform a backward pass to compute gradients for PyTorch's implementation.
    activated_x_torch.backward(torch.ones_like(x))
    gradients_torch = x.grad

    # Print the outputs and gradients from PyTorch's implementation.
    print("PyTorch Tanh output:", activated_x_torch)
    print("PyTorch Tanh gradients:", gradients_torch)

Custom Tanh output: tensor([-0.7616,  0.0000,  0.7616,  0.9640], grad_fn=<DivBackward0>)
Custom Tanh gradients: tensor([0.4200, 1.0000, 0.4200, 0.0707])
PyTorch Tanh output: tensor([-0.7616,  0.0000,  0.7616,  0.9640], grad_fn=<TanhBackward0>)
PyTorch Tanh gradients: tensor([0.4200, 1.0000, 0.4200, 0.0707])


In [8]:
class Softmax(nn.Module):
    """
    Softmax activation built from basic tensor operations.
    """
    def __init__(self):
        super().__init__()

    def forward(self, x, dim=1):
        """
        Normalise x into a probability distribution along one dimension.

        :param x: Input tensor.
        :param dim: The dimension Softmax would be applied to.
        :return: Output tensor after applying Softmax.
        """
        # Subtracting the per-slice maximum leaves the result unchanged but
        # keeps the exponentials from overflowing.
        slice_max = x.max(dim=dim, keepdim=True).values
        numerators = (x - slice_max).exp()
        return numerators / numerators.sum(dim=dim, keepdim=True)


# Example usage: compare the custom Softmax with torch.softmax on the same input.
if __name__ == "__main__":
    # Test the Softmax implementation.
    # Define a sample input tensor.
    x = torch.tensor([-1.0, 0.0, 1.0, 2.0], requires_grad=True)

    # Initialize the custom Softmax activation function.
    custom_softmax = Softmax()

    # Compute the activation using the custom Softmax class.
    activated_x_custom = custom_softmax(x, dim=0)

    # NOTE(review): dummy_grad is only referenced by the commented-out call
    # below, and its shape (1, 4) differs from x's shape (4,).
    dummy_grad = torch.tensor([[0.1, 0.2, 0.3, 0.4]])
    
    # Perform a backward pass to compute gradients for the custom implementation.
    # Softmax outputs sum to 1, so the gradient of their sum is zero everywhere.
    # activated_x_custom.backward(dummy_grad)
    activated_x_custom.sum().backward()  
    gradients_custom = x.grad

    # Print the outputs and gradients from the custom implementation.
    print("Custom Softmax output:", activated_x_custom)
    print("Custom Softmax gradients:", gradients_custom)

    # Reset gradients to zero before another backward pass
    x.grad = None

    # Compute the activation using PyTorch's built-in softmax function.
    activated_x_torch = torch.softmax(x, dim=0)

    # Perform a backward pass to compute gradients for PyTorch's implementation.
    activated_x_torch.backward(torch.ones_like(x))
    gradients_torch = x.grad

    # Print the outputs and gradients from PyTorch's implementation.
    print("PyTorch Softmax output:", activated_x_torch)
    print("PyTorch Softmax gradients:", gradients_torch)

Custom Softmax output: tensor([0.0321, 0.0871, 0.2369, 0.6439], grad_fn=<DivBackward0>)
Custom Softmax gradients: tensor([0., 0., 0., 0.])
PyTorch Softmax output: tensor([0.0321, 0.0871, 0.2369, 0.6439], grad_fn=<SoftmaxBackward0>)
PyTorch Softmax gradients: tensor([0., 0., 0., 0.])


# **Task 3**: Deriving and Understanding the Sigmoid Function

The sigmoid function is a widely used activation function in the field of machine learning, especially in logistic regression and neural networks. It maps any real-valued number into the range between 0 and 1.

1. Given the sigmoid function defined as $\sigma(x) = \frac{1}{1 + e^{-x}}$, **compute the derivative** $\frac{d\sigma(x)}{dx}$ **with respect to $x$**.

2. A special property of the sigmoid function is that its derivative can be expressed in terms of the sigmoid function itself. If we denote $y = \sigma(x)$, **show how the derivative you've computed can be re-written in terms of $y$**, where $y$ is the output of the sigmoid function.

   *Hint: Your answer should only depend on $y$.*


-- Your solution here

# **Task 4**: Connecting Sigmoid and Softmax Functions

The sigmoid and softmax functions are foundational to machine learning, particularly in classification tasks. While the sigmoid function is traditionally used for binary classification, the softmax function generalizes this concept to multi-class problems. The sigmoid function can be seen as a special case of the softmax function when the output space consists of two classes.

Consider a binary classification problem and the general form of the softmax function for an arbitrary vector $\mathbf{z}$ with components $z_i$ for $i = 1, \ldots, K$ classes. The softmax function is defined as:

$$
\text{softmax}(\mathbf{z})_i = \frac{e^{z_i}}{\sum_{j=1}^K e^{z_j}}
$$

Your task is to demonstrate that the softmax function simplifies to the sigmoid function in the context of binary classification.

1. **Express the Softmax Function for Two Classes:**
   Show the softmax function for a two-class system and define the components of the vector $\mathbf{z} $ as arbitrary logits without specifying any particular values.

2. **Derive the Sigmoid Function from Softmax:**
   Simplify the expression for the probability of the first class and show how it is equivalent to the sigmoid function for an arbitrary logit.

_Hint: Consider the nature of binary classification and how the probabilities must sum to one._


-- Your solution here

# **Task 5:** Understanding Logits and Log Odds

In logistic regression and neural networks, the concepts of logits and log odds play a central role in modeling probabilities.

- **Logits:** The logit function is the inverse of the sigmoid function. It takes a probability value and maps it to the entire real number line, which can be interpreted as the log odds.

- **Log Odds:** This is the logarithm of the odds ratio. For a probability $p$, the odds are $\frac{p}{1-p}$, and the log odds, or logits, is the natural logarithm of this odds: $\text{logit}(p) = \log\left(\frac{p}{1-p}\right)$. In logistic regression, we predict log odds with the linear combination of features, and then convert these predictions into probabilities using the sigmoid function.



**The Sigmoid Inverse:** The inverse of the sigmoid function, denoted as $(\sigma^{-1})$, is the logit function. Given the sigmoid function defined as:
$$
\sigma(x) = \frac{1}{1 + e^{-x}}
$$
**Derive its inverse, $(\sigma^{-1}(y))$,** which takes a probability and gives the corresponding log odds.

Hint: To find the inverse, set $y = \sigma(x)$, and solve for $x$ in terms of $y$. The result will give you the logit function.



-- Your solution here

# Task 6: Understanding Backpropagation and the Chain Rule

Background

Backpropagation is an algorithm commonly used for training neural networks. It leverages the chain rule to calculate the gradient of the loss function with respect to each weight in the network. This gradient tells us how much the loss will change for a small change in the weights, and it's used to update the weights to minimize the loss.

The chain rule is a fundamental principle in calculus that is used to find the derivative of composite functions. If we have functions nested within each other, the chain rule allows us to take the derivative of the entire expression by multiplying the derivatives of the constituent functions.

1. **Chain Rule for Simple Composition**

   Given a function composed as $f(u(x))$, where $u$ is a function of $x$, use the chain rule to find the derivative of $f$ with respect to $x$.

   **Example Function:**

   Let $f(u) = e^u$ and $u(x) = 2x + 3$. Compute $\frac{df}{dx}$.

   **Solution:**

   First, find $\frac{du}{dx}$ where $u(x) = 2x + 3$. The derivative is $\frac{du}{dx} = 2$.

   Then, compute $\frac{df}{du}$ for $f(u) = e^u$. The derivative is $\frac{df}{du} = e^u$.

   Multiply $\frac{du}{dx}$ by $\frac{df}{du}$ to get $\frac{df}{dx} = 2e^{(2x+3)}$.

2. **Chain Rule for Nested Composition**

   For a nested function $f(g(u(x)))$, where $g$ is a function of $u(x)$, and $u$ is a function of $x$, apply the chain rule to compute the derivative of $f$ with respect to $x$.

   **Example Function:**

   Let $f(g) = \sin(g)$, $g(u) = u^2$, and $u(x) = 3x - 5$. Find $\frac{df}{dx}$.

   **Solution:**

   Start by finding $\frac{du}{dx}$ for $u(x) = 3x - 5$. The derivative is $\frac{du}{dx} = 3$.

   Next, find $\frac{dg}{du}$ for $g(u) = u^2$. The derivative is $\frac{dg}{du} = 2u$.

   Then, compute $\frac{df}{dg}$ for $f(g) = \sin(g)$. The derivative is $\frac{df}{dg} = \cos(g)$.

   By the chain rule, $\frac{df}{dx} = \frac{df}{dg} \cdot \frac{dg}{du} \cdot \frac{du}{dx} = \cos(u^2) \cdot 2u \cdot 3$.

   Substituting $u(x)$ into the derivative, we get $\frac{df}{dx} = \cos((3x - 5)^2) \cdot 2(3x - 5) \cdot 3$.


**Task:** Consider a neural network with a single neuron that takes two inputs $x_1$ and $x_2$, with weights $w_1$ and $w_2$ respectively, and a bias $b$. The output of the neuron is passed through a hyperbolic tangent activation function:

$$
f(x) = \tanh(w_1x_1 + w_2x_2 + b)
$$

The hyperbolic tangent function, $\tanh(x)$, is defined as:

$$
\tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
$$

**Compute the partial derivative of the function $f(x)$ with respect to $w_1$.**

-- Your solution here

# **Task 7 (Optional):** Implementing Custom Optimizers in PyTorch

In this task, you will delve into the mechanics of optimization algorithms in deep learning by creating custom optimizer classes in PyTorch. Optimizers are the engines that power the learning process, updating model weights based on gradients to minimize loss functions. You will start by understanding the foundational principles of the Gradient Descent optimizer. Following this, you will **implement** custom versions of more advanced optimizers such as Stochastic Gradient Descent (SGD), Momentum, and Adam, and **compare** their performance with PyTorch's built-in optimizers.

In [10]:
class GradientDescentOptimizer:
    """
    Plain gradient descent: every step moves each parameter against its gradient.
    """
    def __init__(self, parameters, learning_rate):
        """
        Store the parameters to optimise and the step size.

        Args:
            parameters (iterable): Iterable of tensors/parameters to optimize.
                It is materialised into a list, so a generator such as
                ``model.parameters()`` is fine.
            learning_rate (float): Learning rate for the optimizer.
        """
        self.learning_rate = learning_rate
        self.parameters = list(parameters)

    def step(self):
        """
        Apply one update ``param <- param - learning_rate * param.grad``.

        Parameters whose ``grad`` is None are left untouched.

        :return: None
        """
        with torch.no_grad():
            for param in self.parameters:
                grad = param.grad
                if grad is None:
                    continue
                param.data.sub_(self.learning_rate * grad)

    def zero_grad(self):
        """
        Reset every existing gradient to zero in place.
        """
        for grad in (param.grad for param in self.parameters):
            if grad is not None:
                grad.zero_()


# Example usage: run a single gradient-descent step on a one-weight linear model.
if __name__ == "__main__":
    # Define a simple model and a sample loss function
    model = torch.nn.Linear(1, 1)
    loss_fn = torch.nn.MSELoss()

    # Define sample input and target data
    input_data = torch.tensor([[1.0], [2.0]], requires_grad=True)
    target_data = torch.tensor([[2.0], [4.0]])

    # Initialize the custom optimizer
    optimizer = GradientDescentOptimizer(model.parameters(), learning_rate=0.01)

    # Forward pass: Compute predicted y by passing input_data to the model
    predicted_y = model(input_data)

    # Compute loss
    loss = loss_fn(predicted_y, target_data)

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # NOTE(review): no random seed is set in this cell, so the printed weight
    # presumably differs from run to run.
    print(f"Updated model weights: {model.weight.data}")

Updated model weights: tensor([[-0.1797]])


In [11]:
class SGDOptimizer:
    """
    Custom implementation of the stochastic gradient descent optimization algorithm.

    Every call to ``step`` applies ``param <- param - learning_rate * param.grad``
    to each parameter that currently holds a gradient.

    Attributes:
        parameters (list): Tensors to optimize, materialized from the iterable
            passed to ``__init__``.
        learning_rate (float): Learning rate for the optimizer.
    """
    def __init__(self, parameters, learning_rate):
        """
        Initializes the SGDOptimizer.

        Args:
            parameters (iterable): Iterable of tensors to optimize, e.g.
                ``model.parameters()``. Parameter-group dicts are not supported.
            learning_rate (float): Learning rate for the optimizer.
        """
        ########################################################################
        # TODO: Initialize parameters and learning rate.                       #
        ########################################################################
        # Store a list so a one-shot iterator can be walked again on every
        # step() / zero_grad() call.
        self.parameters = list(parameters)
        self.learning_rate = learning_rate
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def step(self):
        """
        Perform a single optimization step using SGD.

        Parameters whose ``grad`` is ``None`` are skipped.

        :return: None
        """
        ########################################################################
        # TODO: Implement the SGD update step.                                 #
        ########################################################################
        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            for param in self.parameters:
                if param.grad is not None:
                    # In-place update on the parameter itself (no ``.data``
                    # detour, no temporary for ``learning_rate * grad``).
                    param.sub_(param.grad, alpha=self.learning_rate)
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def zero_grad(self):
        """
        Clear gradients of all optimized parameters.

        Existing gradient tensors are zeroed in place; parameters without a
        gradient are left as they are.

        :return: None
        """
        ########################################################################
        # TODO: Clear gradients of all parameters.                             #
        ########################################################################
        for param in self.parameters:
            if param.grad is not None:
                param.grad.zero_()
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

# Example usage
if __name__ == "__main__":
    # TODO: Repeat the process above for the SGD optimizer
    # Define two models with identical initial parameters and a sample loss
    # function, so the custom optimizer can be checked against PyTorch's.
    model1 = nn.Linear(1, 1)
    model2 = nn.Linear(1, 1)
    loss_fn = nn.MSELoss()
    model2.weight = nn.Parameter(model1.weight.clone())
    model2.bias = nn.Parameter(model1.bias.clone())

    # Define sample input and target data
    input_data = torch.tensor([[1.0], [2.0]])
    target_data = torch.tensor([[2.0], [4.0]])

    # Initialize the custom SGD optimizer (the class under test in this cell)
    # and the PyTorch reference optimizer with the same learning rate.
    optimizer1 = SGDOptimizer(model1.parameters(), learning_rate=0.01)
    optimizer2 = torch.optim.SGD(model2.parameters(), lr=0.01)

    # Custom optimizer: zero gradients, forward pass, loss, backward pass,
    # weight update.
    optimizer1.zero_grad()
    predicted_y1 = model1(input_data)
    loss1 = loss_fn(predicted_y1, target_data)
    loss1.backward()
    optimizer1.step()

    # Reference optimizer: same sequence on the second model.
    optimizer2.zero_grad()
    predicted_y2 = model2(input_data)
    loss2 = loss_fn(predicted_y2, target_data)
    loss2.backward()
    optimizer2.step()

    print(f"Updated model1 weights (Custom SGD): {model1.weight.data}, bias: {model1.bias.data}")
    print(f"Updated model2 weights (PyTorch SGD): {model2.weight.data}, bias: {model2.bias.data}")

Updated model1 weights (Custom SGD): tensor([[-0.0349]]), bias: tensor([-0.3047])
Updated model2 weights (PyTorch SGD): tensor([[-0.0349]]), bias: tensor([-0.3047])


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
class MomentumOptimizer:
    """
    Stochastic gradient descent with a momentum (velocity) term.

    One velocity buffer is kept per parameter. A step first refreshes the
    buffer, ``velocity = momentum * velocity + learning_rate * grad``, and then
    moves the parameter by ``-velocity``.
    """
    def __init__(self, parameters, learning_rate, momentum=0.9):
        """
        Set up the optimizer state.

        Args:
            parameters (iterable): Tensors to optimize; stored as a list.
            learning_rate (float): Factor applied to each gradient before it is
                added to the velocity.
            momentum (float): Factor applied to the previous velocity (default: 0.9).
        """
        ########################################################################
        # TODO: Initialize parameters, learning rate, and momentum.            #
        ########################################################################
        self.momentum = momentum
        self.learning_rate = learning_rate
        self.parameters = list(parameters)
        # One zero-initialised velocity buffer per parameter, in matching order.
        self.velocities = [torch.zeros_like(tensor.data) for tensor in self.parameters]
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def step(self):
        """
        Apply one momentum-SGD update to every parameter that has a gradient.

        :return: None
        """
        ########################################################################
        # TODO: Implement the momentum update step.                            #
        ########################################################################
        lr = self.learning_rate
        mu = self.momentum
        with torch.no_grad():
            for weight, buf in zip(self.parameters, self.velocities):
                grad = weight.grad
                if grad is None:
                    # No gradient: parameter and velocity both stay as they are.
                    continue
                # velocity <- mu * velocity + lr * grad
                buf.mul_(mu)
                buf.add_(grad, alpha=lr)
                # parameter <- parameter - velocity
                weight.sub_(buf)
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def zero_grad(self):
        """
        Zero, in place, the gradient of every parameter that has one.

        :return: None
        """
        ########################################################################
        # TODO: Clear gradients of all parameters.                             #
        ########################################################################
        for weight in self.parameters:
            grad = weight.grad
            if grad is None:
                continue
            grad.zero_()
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################


# Example usage
if __name__ == "__main__":
    # TODO: Repeat the process above for the SGD with momentum optimizer
    # Build two linear models and copy model1's initial weight and bias into
    # model2 so that both optimizers start from the same point.
    model1 = nn.Linear(1, 1)
    model2 = nn.Linear(1, 1)
    model2.weight = nn.Parameter(model1.weight.clone())
    model2.bias = nn.Parameter(model1.bias.clone())
    loss_fn = nn.MSELoss()

    # Toy regression data shared by both models
    target_data = torch.tensor([[2.0], [4.0]])
    input_data = torch.tensor([[1.0], [2.0]])

    # Built-in reference optimizer and the custom one, same hyper-parameters
    optimizer2 = torch.optim.SGD(model2.parameters(), lr=0.01, momentum=0.9)
    optimizer1 = MomentumOptimizer(model1.parameters(), learning_rate=0.01, momentum=0.9)

    # Forward passes and losses for both models
    predicted_y1 = model1(input_data)
    predicted_y2 = model2(input_data)
    loss1 = loss_fn(predicted_y1, target_data)
    loss2 = loss_fn(predicted_y2, target_data)

    # Custom optimizer: clear gradients, backpropagate, update
    optimizer1.zero_grad()
    loss1.backward()
    optimizer1.step()

    # PyTorch optimizer: clear gradients, backpropagate, update
    optimizer2.zero_grad()
    loss2.backward()
    optimizer2.step()

    print(f"Updated model1 weights (Custom SGD): {model1.weight.data}, bias: {model1.bias.data}")
    print(f"Updated model2 weights (PyTorch SGD): {model2.weight.data}, bias: {model2.bias.data}")

Updated model1 weights (Custom SGD): tensor([[-0.8152]]), bias: tensor([0.8463])
Updated model2 weights (PyTorch SGD): tensor([[-0.8152]]), bias: tensor([0.8463])


In [13]:
class AdamOptimizer:
    """
    Custom implementation of the Adam optimization algorithm.

    Attributes:
        parameters (list): Tensors to optimize, materialized from the iterable
            passed to ``__init__``.
        learning_rate (float): Learning rate for the optimizer.
        betas (Tuple[float, float]): Decay rates of the running averages of the
            gradient and of its square.
        eps (float): Term added to the denominator of the update.
        t (int): Number of times ``step`` has been called.
        m (list): Per-parameter running average of the gradient.
        v (list): Per-parameter running average of the squared gradient.
        steps (list): Per-parameter number of updates actually applied; this is
            the timestep used for bias correction.
    """
    def __init__(self, parameters, learning_rate=0.001, betas=(0.9, 0.999), eps=1e-8):
        """
        Initializes the AdamOptimizer.

        Args:
            parameters (iterable): Iterable of tensors to optimize, e.g.
                ``model.parameters()``. Parameter-group dicts are not supported.
            learning_rate (float): Learning rate for the optimizer.
            betas (Tuple[float, float]): Coefficients used for computing running averages of gradient and its square.
            eps (float): Term added to the denominator to improve numerical stability.
        """
        ########################################################################
        # TODO: Initialize parameters, learning rate, betas, and eps.          #
        ########################################################################
        self.parameters = list(parameters)
        self.learning_rate = learning_rate
        self.betas = betas
        self.eps = eps
        self.t = 0
        self.m = [torch.zeros_like(p.data) for p in self.parameters]
        self.v = [torch.zeros_like(p.data) for p in self.parameters]
        # A parameter that is skipped (grad is None) keeps its moment buffers
        # unchanged, so its bias-correction timestep must not advance either.
        self.steps = [0] * len(self.parameters)
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def step(self):
        """
        Perform a single optimization step using Adam.

        Parameters whose ``grad`` is ``None`` are skipped: neither their moment
        estimates nor their bias-correction timestep change.

        :return: None
        """
        ########################################################################
        # TODO: Implement the Adam update step.                                #
        ########################################################################
        self.t += 1
        beta1, beta2 = self.betas[0], self.betas[1]
        with torch.no_grad():
            for index, (param, m, v) in enumerate(zip(self.parameters, self.m, self.v)):
                grad = param.grad
                if grad is None:
                    continue

                # Timestep of this parameter, counting only real updates
                self.steps[index] += 1
                step_count = self.steps[index]

                # Update biased first moment estimate
                m.mul_(beta1).add_(grad, alpha=1 - beta1)
                # Update biased second raw moment estimate
                v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # Compute bias-corrected first moment estimate
                m_hat = m / (1 - beta1 ** step_count)
                # Compute bias-corrected second raw moment estimate
                v_hat = v / (1 - beta2 ** step_count)

                # Update parameters: param -= lr * m_hat / (sqrt(v_hat) + eps)
                param.addcdiv_(m_hat, v_hat.sqrt().add_(self.eps), value=-self.learning_rate)
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def zero_grad(self):
        """
        Clear gradients of all optimized parameters.

        Existing gradient tensors are zeroed in place; parameters without a
        gradient are left as they are.

        :return: None
        """
        ########################################################################
        # TODO: Clear gradients of all parameters.                             #
        ########################################################################
        for param in self.parameters:
            if param.grad is not None:
                param.grad.zero_()
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

# Example usage
if __name__ == "__main__":
    # TODO: Repeat the process above for the Adam optimizer
    # Define two models with identical initial parameters and a sample loss
    # function, so the custom optimizer can be checked against PyTorch's.
    model1 = nn.Linear(1, 1)
    model2 = nn.Linear(1, 1)
    loss_fn = nn.MSELoss()
    model2.weight = nn.Parameter(model1.weight.clone())
    model2.bias = nn.Parameter(model1.bias.clone())

    # Define sample input and target data
    input_data = torch.tensor([[1.0], [2.0]])
    target_data = torch.tensor([[2.0], [4.0]])

    # Initialize the custom Adam optimizer and the PyTorch reference optimizer
    # with the same learning rate.
    optimizer1 = AdamOptimizer(model1.parameters(), learning_rate=0.01)
    optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.01)

    # Custom optimizer: zero gradients, forward pass, loss, backward pass,
    # weight update.
    optimizer1.zero_grad()
    predicted_y1 = model1(input_data)
    loss1 = loss_fn(predicted_y1, target_data)
    loss1.backward()
    optimizer1.step()

    # Reference optimizer: same sequence on the second model.
    optimizer2.zero_grad()
    predicted_y2 = model2(input_data)
    loss2 = loss_fn(predicted_y2, target_data)
    loss2.backward()
    optimizer2.step()

    # Label the results with the optimizer that actually produced them.
    print(f"Updated model1 weights (Custom Adam): {model1.weight.data}, bias: {model1.bias.data}")
    print(f"Updated model2 weights (PyTorch Adam): {model2.weight.data}, bias: {model2.bias.data}")

Updated model1 weights (Custom SGD): tensor([[-0.1830]]), bias: tensor([0.9239])
Updated model2 weights (PyTorch SGD): tensor([[-0.1830]]), bias: tensor([0.9239])
