In [None]:
import numpy as np

**Module** is an abstract class which defines the fundamental methods necessary for training a neural network. You do not need to change anything here, just read the comments.

In [None]:
class Module:
    """
    Base building block of a network: a black box that maps `input` to `output`.

    The mapping is exposed through `forward`:

        output = module.forward(input)

    A module must also support a backward pass, i.e. differentiating `forward`
    as one link of a chain rule, given the gradient coming from the next link:

        gradInput = module.backward(input, gradOutput)
    """

    def __init__(self):
        self.output = None      # result of the last forward pass
        self.gradInput = None   # gradient w.r.t. the input from the last backward pass
        self.training = True    # True in training mode, False in evaluation mode

    def forward(self, input):
        """
        Compute and return the module output for `input` (delegates to `updateOutput`).
        """
        return self.updateOutput(input)

    def backward(self, input, gradOutput):
        """
        Run one backpropagation step for the given `input`.

        Two things happen, in this order:
         - the gradient w.r.t. `input` is computed (needed to continue backprop),
         - the gradient w.r.t. the parameters is computed (needed by the optimizer).

        Returns the `gradInput` field.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Compute the output from the current parameters and `input`.

        Implementations must both store the result in the `output` field
        and return it. The simplest possible implementation is the identity:

            self.output = input
            return self.output

        The base implementation does nothing and returns None.
        """

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own input.

        `gradInput` always has the same shape as `input`. Implementations must
        both store it in the `gradInput` field and return it. The simplest
        possible implementation passes the gradient through:

            self.gradInput = gradOutput
            return self.gradInput

        The base implementation does nothing and returns None.
        """

    def accGradParameters(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own parameters.
        Parameter-free modules (e.g. ReLU) do not need to override this.
        """

    def zeroGradParameters(self):
        """
        Reset the parameter gradients to zero, if the module has parameters.
        """

    def getParameters(self):
        """
        Return the list of parameters; empty for parameter-free modules.
        """
        return []

    def getGradParameters(self):
        """
        Return the list of gradients w.r.t. the parameters; empty for
        parameter-free modules.
        """
        return []

    def train(self):
        """
        Switch the module to training mode.
        Dropout and BatchNorm behave differently in training and testing.
        """
        self.training = True

    def evaluate(self):
        """
        Switch the module to evaluation mode.
        Dropout and BatchNorm behave differently in training and testing.
        """
        self.training = False

    def __repr__(self):
        """
        Human-readable name; override in every module for a readable description.
        """
        return "Module"

# Sequential container

**Define** the forward and backward pass procedures.

In [None]:
class Sequential(Module):
    """
    Container that feeds `input` through its modules one after another.

    Each module (layer) in `self.modules` consumes the output of the previous
    one; the output of the last module is the container's `output`.
    """

    def __init__(self):
        super().__init__()
        self.modules = []

    def add(self, module):
        """
        Append a module to the end of the container.
        """
        self.modules.append(module)

    def updateOutput(self, input):
        """
        FORWARD PASS:

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        With no modules the container is the identity.
        """
        activation = input
        for layer in self.modules:
            activation = layer.forward(activation)

        self.output = activation
        return self.output

    def backward(self, input, gradOutput):
        """
        BACKWARD PASS:

            g_{n-1} = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2} = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1 = module[1].backward(y_0, g_2)
            gradInput = module[0].backward(input, g_1)

        Every module must receive the very input it saw during the forward
        pass: for the `i`-th layer that is the stored output of `module[i-1]`
        (and the container's own `input` only for the first layer).

        With no modules the gradient is passed through unchanged.
        """
        # Input seen by each layer during forward: the container input for the
        # first layer, the previous layer's stored output for the others.
        layer_inputs = [input] + [layer.output for layer in self.modules[:-1]]

        grad = gradOutput
        for layer, layer_input in zip(reversed(self.modules), reversed(layer_inputs)):
            grad = layer.backward(layer_input, grad)

        self.gradInput = grad
        return self.gradInput

    def zeroGradParameters(self):
        """
        Reset the parameter gradients of every contained module.
        """
        for layer in self.modules:
            layer.zeroGradParameters()

    def getParameters(self):
        """
        Collect the parameter lists of all modules (one entry per module).
        """
        return [layer.getParameters() for layer in self.modules]

    def getGradParameters(self):
        """
        Collect the parameter-gradient lists of all modules (one entry per module).
        """
        return [layer.getGradParameters() for layer in self.modules]

    def __repr__(self):
        # One line per contained module, each terminated by a newline.
        return "".join(str(layer) + '\n' for layer in self.modules)

    def __getitem__(self, x):
        return self.modules[x]

    def train(self):
        """
        Put the container and all contained modules into training mode.
        """
        self.training = True
        for layer in self.modules:
            layer.train()

    def evaluate(self):
        """
        Put the container and all contained modules into evaluation mode.
        """
        self.training = False
        for layer in self.modules:
            layer.evaluate()


# Layers

## 1 (0.2). Linear transform layer
Also known as dense layer, fully-connected layer, FC-layer, InnerProductLayer (in caffe), affine transform
- input:   **`batch_size x n_feats1`**
- output: **`batch_size x n_feats2`**

In [None]:
class Linear(Module):
    """
    Affine layer `y = x W^T + b` (a.k.a. fully-connected layer,
    InnerProductLayer in caffe).

    Expects 2D input of shape (n_samples, n_feature).
    """
    def __init__(self, n_in, n_out):
        super().__init__()

        # Uniform initialisation in [-1/sqrt(n_in), 1/sqrt(n_in)].
        bound = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(-bound, bound, size=(n_out, n_in))
        self.b = np.random.uniform(-bound, bound, size=n_out)

        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """Store and return `input @ W.T + b`."""
        projected = np.dot(input, self.W.T)
        self.output = projected + self.b
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Store and return the gradient w.r.t. the input: `gradOutput @ W`."""
        self.gradInput = np.dot(gradOutput, self.W)
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """Accumulate (in place) the gradients w.r.t. `W` and `b`."""
        self.gradW += np.dot(gradOutput.T, input)
        self.gradb += np.sum(gradOutput, axis=0)

    def zeroGradParameters(self):
        """Reset the accumulated parameter gradients in place."""
        for grad in (self.gradW, self.gradb):
            grad.fill(0)

    def getParameters(self):
        return [self.W, self.b]

    def getGradParameters(self):
        return [self.gradW, self.gradb]

    def __repr__(self):
        n_out, n_in = self.W.shape
        return 'Linear %d -> %d' % (n_in, n_out)


## 2. (0.2) SoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{softmax}(x)_i = \frac{\exp x_i} {\sum_j \exp x_j}$

Recall that $\text{softmax}(x) == \text{softmax}(x - \text{const})$. It makes possible to avoid computing exp() from large argument.

In [None]:
class SoftMax(Module):
    """
    Row-wise softmax: `softmax(x)_i = exp(x_i) / sum_j exp(x_j)`.

    Input and output are 2D arrays of shape (batch_size, n_feats).
    """
    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """Store and return the softmax of every row of `input`."""
        # softmax(x) == softmax(x - const): subtracting the row maximum keeps
        # the arguments of exp() non-positive, which avoids overflow.
        shifted = np.subtract(input, input.max(axis=1, keepdims=True))

        exp_x = np.exp(shifted)
        self.output = exp_x / np.sum(exp_x, axis=1, keepdims=True)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Store and return the gradient w.r.t. `input`.

        Uses `self.output` from the most recent forward pass. For one row with
        softmax `s` and upstream gradient `g`, the Jacobian is
        `diag(s) - s s^T`, so `g @ J = s * (g - <g, s>)`. Evaluating that closed
        form for the whole batch avoids building a Jacobian per sample.
        """
        row_dot = np.sum(gradOutput * self.output, axis=1, keepdims=True)
        self.gradInput = self.output * (gradOutput - row_dot)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"


## 3. (0.2) LogSoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{logsoftmax}(x)_i = \log\text{softmax}(x)_i = x_i - \log {\sum_j \exp x_j}$

The main goal of this layer is to be used in computation of log-likelihood loss.

In [None]:
class LogSoftMax(Module):
    """
    Row-wise log-softmax: `logsoftmax(x)_i = x_i - log(sum_j exp(x_j))`.

    Input and output are 2D arrays of shape (batch_size, n_feats).
    """
    def __init__(self):
        super().__init__()

    def updateOutput(self, input):
        """Store and return the log-softmax of every row of `input`."""
        # Shift each row by its maximum first, for numerical stability.
        shifted = np.subtract(input, input.max(axis=1, keepdims=True))

        log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        self.output = shifted - log_norm
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Store and return the gradient w.r.t. `input`.

        Relies on `self.output` from the most recent forward pass:
        exp(log-softmax) recovers the softmax probabilities.
        """
        probs = np.exp(self.output)
        grad_total = np.sum(gradOutput, axis=1, keepdims=True)
        self.gradInput = gradOutput - probs * grad_total
        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"


## 4. (0.3) Batch normalization
One of the most significant recent ideas that impacted NNs a lot is [**Batch normalization**](http://arxiv.org/abs/1502.03167). The idea is simple, yet effective: the features should be whitened ($mean = 0$, $std = 1$) all the way through NN. This improves the convergence for deep models letting it train them for days but not weeks. **You are** to implement the first part of the layer: features normalization. The second part (`ChannelwiseScaling` layer) is implemented below.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

The layer should work as follows. While training (`self.training == True`) it transforms input as $$y = \frac{x - \mu}  {\sqrt{\sigma^2 + \epsilon}}$$
where $\mu$ and $\sigma^2$ are the mean and variance of feature values in the **batch** and $\epsilon$ is just a small number for numerical stability. Also during training, the layer should maintain exponential moving average values for mean and variance:
```
    self.moving_mean = self.moving_mean * alpha + batch_mean * (1 - alpha)
    self.moving_variance = self.moving_variance * alpha + batch_variance * (1 - alpha)
```
During testing (`self.training == False`) the layer normalizes input using moving_mean and moving_variance.

Note that decomposition of batch normalization on normalization itself and channelwise scaling here is just a common **implementation** choice. In general "batch normalization" always assumes normalization + scaling.

In [None]:
class BatchNormalization(Module):
    """
    Feature-wise normalisation `y = (x - mean) / sqrt(var + EPS)`.

    In training mode the statistics come from the current batch (axis 0) and
    exponential moving averages of them are maintained; in evaluation mode the
    moving averages are used instead.
    """
    EPS = 1e-3

    def __init__(self, alpha = 0.):
        super().__init__()
        self.alpha = alpha            # weight of the old value in the moving averages
        self.moving_mean = None       # set on the first training forward pass
        self.moving_variance = None
        self.training = True

    def updateOutput(self, input):
        """Store and return the normalised `input`."""
        if not self.training:
            # Evaluation: normalise with the accumulated statistics.
            denom = np.sqrt(self.moving_variance + self.EPS)
            self.output = (input - self.moving_mean) / denom
            return self.output

        mu = np.mean(input, axis=0)
        var = np.var(input, axis=0)

        if self.moving_mean is None:
            # First batch seen: start the moving averages from its statistics.
            self.moving_mean = mu
            self.moving_variance = var
        else:
            self.moving_mean = self.moving_mean * self.alpha + mu * (1 - self.alpha)
            self.moving_variance = self.moving_variance * self.alpha + var * (1 - self.alpha)

        self.output = (input - mu) / np.sqrt(var + self.EPS)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Store and return the gradient w.r.t. `input`.

        In training mode the batch mean and variance depend on `input`, so their
        contributions are added; in evaluation mode the statistics are constants
        and the gradient is only rescaled.
        """
        if not self.training:
            inv_std = 1.0 / np.sqrt(self.moving_variance + self.EPS)
            self.gradInput = gradOutput * inv_std
            return self.gradInput

        n = input.shape[0]
        mu = np.mean(input, axis=0)
        var = np.var(input, axis=0)

        centered = input - mu
        inv_std = 1.0 / np.sqrt(var + self.EPS)

        # Gradients flowing through the batch variance and the batch mean.
        grad_var = np.sum(gradOutput * centered, axis=0) * -0.5 * inv_std**3
        grad_mean = np.sum(gradOutput * -inv_std, axis=0) + grad_var * np.mean(-2.0 * centered, axis=0)

        self.gradInput = gradOutput * inv_std + grad_var * 2.0 * centered / n + grad_mean / n
        return self.gradInput

    def __repr__(self):
        return "BatchNormalization"


In [None]:
class ChannelwiseScaling(Module):
    r"""
       Implements linear transform of input y = \gamma * x + \beta
       where \gamma, \beta - learnable vectors of length x.shape[-1]
    """
    def __init__(self, n_out):
        super(ChannelwiseScaling, self).__init__()

        stdv = 1./np.sqrt(n_out)
        self.gamma = np.random.uniform(-stdv, stdv, size=n_out)
        self.beta = np.random.uniform(-stdv, stdv, size=n_out)

        self.gradGamma = np.zeros_like(self.gamma)
        self.gradBeta = np.zeros_like(self.beta)

    def updateOutput(self, input):
        """Store and return `input * gamma + beta` (broadcast over the batch)."""
        self.output = input * self.gamma + self.beta
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Store and return the gradient w.r.t. `input`: `gradOutput * gamma`."""
        self.gradInput = gradOutput * self.gamma
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Accumulate the gradients w.r.t. `beta` and `gamma`.

        The update is done in place (`+=`), matching `Linear`: the arrays handed
        out by `getGradParameters` stay valid, and `zeroGradParameters` is what
        resets them between optimisation steps.
        """
        self.gradBeta += np.sum(gradOutput, axis=0)
        self.gradGamma += np.sum(gradOutput*input, axis=0)

    def zeroGradParameters(self):
        """Reset the accumulated parameter gradients in place."""
        self.gradGamma.fill(0)
        self.gradBeta.fill(0)

    def getParameters(self):
        return [self.gamma, self.beta]

    def getGradParameters(self):
        return [self.gradGamma, self.gradBeta]

    def __repr__(self):
        return "ChannelwiseScaling"

Practical notes. If BatchNormalization is placed after a linear transformation layer (including dense layer, convolutions, channelwise scaling) that implements a function like `y = weight * x + bias`, then adding the bias becomes useless and can be omitted, since its effect is discarded by the batch mean subtraction. If BatchNormalization (followed by `ChannelwiseScaling`) is placed before a layer that propagates scale (including ReLU, LeakyReLU) followed by any linear transformation layer, then the parameter `gamma` in `ChannelwiseScaling` can be frozen, since it can be absorbed into the linear transformation layer.

## 5. (0.3) Dropout
Implement [**dropout**](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf). The idea and implementation are really simple: just multiply the input by a $Bernoulli(1 - p)$ mask. Here $p$ is the probability of an element being zeroed.

This has proven to be an effective technique for regularization and preventing the co-adaptation of neurons.

While training (`self.training == True`) it should sample a mask on each iteration (for every batch), zero out elements and multiply elements by $1 / (1 - p)$. The latter is needed for keeping mean values of features close to mean values which will be in test mode. When testing this module should implement identity transform i.e. `self.output = input`.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [None]:
def updateOutput(self, input):
    """
    Dropout forward pass.

    In training mode a fresh 0/1 mask is sampled (each element is kept with
    probability `1 - self.p`), stored in `self.mask`, and the surviving
    elements are rescaled by `1 / (1 - self.p)`. In evaluation mode the input
    is passed through unchanged.
    """
    if not self.training:
        self.output = input
        return self.output

    keep_prob = 1 - self.p
    self.mask = np.random.binomial(1, keep_prob, input.shape)
    rescale = 1.0 / keep_prob
    self.output = input * self.mask * rescale
    return self.output

def updateGradInput(self, input, gradOutput):
    """
    Dropout backward pass.

    In training mode the gradient is masked with `self.mask` (sampled by the
    forward pass) and rescaled by `1 / (1 - self.p)`; in evaluation mode it is
    passed through unchanged.
    """
    if not self.training:
        self.gradInput = gradOutput
        return self.gradInput

    rescale = 1.0 / (1 - self.p)
    self.gradInput = gradOutput * self.mask * rescale
    return self.gradInput


#6. (2.0) Conv2d
Implement [**Conv2d**](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html). Use only this list of parameters: (in_channels, out_channels, kernel_size, stride, padding, bias, padding_mode) and fix dilation=1 and groups=1.

In [None]:
def updateOutput(self, input):
    """
    Conv2d forward pass (dilation=1, groups=1).

    Reads `kernel_size`, `in_channels`, `out_channels`, `stride`, `padding`,
    `bias` (a flag) and `padding_mode` from `self`. `self.weight` and, when
    `self.bias` is truthy, `self.bias_term` are created lazily on first use
    with values drawn uniformly from [-0.1, 0.1].

    `torch.nn.functional.conv2d` has no `padding_mode` argument, so any mode
    other than 'zeros' is applied explicitly with `F.pad` and the convolution
    is then run without implicit padding.
    """
    import torch
    import torch.nn.functional as F

    def as_pair(value):
        return value if isinstance(value, tuple) else (value, value)

    kernel_size = as_pair(self.kernel_size)

    if not hasattr(self, 'weight'):
        self.weight = torch.empty(self.out_channels, self.in_channels, *kernel_size)
        self.weight.uniform_(-0.1, 0.1)

    if self.bias and not hasattr(self, 'bias_term'):
        self.bias_term = torch.empty(self.out_channels)
        self.bias_term.uniform_(-0.1, 0.1)

    if self.padding_mode == 'zeros':
        padded, padding = input, self.padding
    else:
        # F.pad takes the padding of the last dimension first:
        # (left, right, top, bottom).
        pad_h, pad_w = as_pair(self.padding)
        padded = F.pad(input, (pad_w, pad_w, pad_h, pad_h), mode=self.padding_mode)
        padding = 0

    self.output = F.conv2d(padded, self.weight,
                           bias=self.bias_term if self.bias else None,
                           stride=self.stride,
                           padding=padding,
                           dilation=1,
                           groups=1)

    return self.output

def updateGradInput(self, input, gradOutput):
    """
    Conv2d backward pass w.r.t. the input (dilation=1, groups=1, zero padding).

    The gradient of `conv2d(x, W)` with respect to `x` is the transposed
    convolution of `gradOutput` with the *same* weight tensor `W`; no channel
    transpose or spatial flip is needed. `output_padding` is derived from the
    input size so the result has exactly `input`'s spatial shape even when the
    stride does not divide the padded extent evenly.

    Raises:
        NotImplementedError: if `self.padding_mode` is set and is not 'zeros'
            (the transposed convolution is the adjoint of zero padding only).
    """
    import torch.nn.functional as F

    if getattr(self, 'padding_mode', 'zeros') != 'zeros':
        raise NotImplementedError(
            "Conv2d backward is implemented only for padding_mode='zeros'")

    def as_pair(value):
        return value if isinstance(value, tuple) else (value, value)

    stride = as_pair(self.stride)
    padding = as_pair(self.padding)
    kernel = tuple(self.weight.shape[-2:])

    # Rows/columns of the input that a plain transposed convolution would not
    # recover (lost to the floor in the forward output-size formula).
    output_padding = tuple(
        in_size - ((out_size - 1) * s - 2 * p + k)
        for in_size, out_size, s, p, k in zip(
            input.shape[-2:], gradOutput.shape[-2:], stride, padding, kernel)
    )

    self.gradInput = F.conv_transpose2d(gradOutput, self.weight,
                                        stride=stride,
                                        padding=padding,
                                        output_padding=output_padding,
                                        groups=1,
                                        dilation=1)

    return self.gradInput


#7. (0.5) Implement [**MaxPool2d**](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) and [**AvgPool2d**](https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html). Use only parameters like kernel_size, stride, padding (negative infinity for maxpool and zero for avgpool) and other parameters fixed as in framework.

In [None]:
def updateOutput(self, input):
    """
    MaxPool2d forward pass.

    `self.kernel_size`, `self.stride` and `self.padding` may each be a single
    value or a tuple; single values are expanded to `(value, value)` before
    being handed to `torch.nn.functional.max_pool2d`.
    """
    import torch.nn.functional as F

    def as_pair(value):
        return value if isinstance(value, tuple) else (value, value)

    window = as_pair(self.kernel_size)
    step = as_pair(self.stride)
    pad = as_pair(self.padding)

    self.output = F.max_pool2d(input, window, step, pad)
    return self.output

def updateGradInput(self, input, gradOutput):
    """
    MaxPool2d backward pass.

    Each output element routes its gradient to the input position that
    produced the maximum. When windows overlap (stride < kernel size) several
    outputs can select the same input element, so the contributions must be
    *summed*; `max_unpool2d` would only keep one of them, hence the explicit
    scatter-add.

    Expects 4D `input` (batch, channels, height, width); `gradOutput` must have
    the shape of the forward output.
    """
    import torch
    import torch.nn.functional as F

    def as_pair(value):
        return value if isinstance(value, tuple) else (value, value)

    kernel_size = as_pair(self.kernel_size)
    stride = as_pair(self.stride)
    padding = as_pair(self.padding)

    with torch.no_grad():
        # `indices` are flat positions inside each (height * width) input plane.
        _, indices = F.max_pool2d(input, kernel_size, stride, padding, return_indices=True)

    batch, channels, height, width = input.shape
    grad_flat = torch.zeros(batch, channels, height * width,
                            dtype=gradOutput.dtype, device=gradOutput.device)
    grad_flat.scatter_add_(2,
                           indices.reshape(batch, channels, -1),
                           gradOutput.reshape(batch, channels, -1))

    self.gradInput = grad_flat.reshape(batch, channels, height, width)

    return self.gradInput
def updateOutput(self, input):
    """
    AvgPool2d forward pass.

    `self.kernel_size`, `self.stride` and `self.padding` may each be a single
    value or a tuple; single values are expanded to `(value, value)` before
    being handed to `torch.nn.functional.avg_pool2d`.
    """
    import torch.nn.functional as F

    def as_pair(value):
        return value if isinstance(value, tuple) else (value, value)

    window = as_pair(self.kernel_size)
    step = as_pair(self.stride)
    pad = as_pair(self.padding)

    self.output = F.avg_pool2d(input, window, step, pad)
    return self.output

def updateGradInput(self, input, gradOutput):
    """
    AvgPool2d backward pass.

    Every output element is the mean over a `kH x kW` window, so its gradient
    is spread evenly (divided by `kH * kW`) over that window; overlapping
    windows add up. `F.fold` performs exactly this overlap-add, but it needs
    one column per window with `C * kH * kW` rows, so each channel's gradient
    is repeated `kH * kW` times before folding. Contributions that fall into
    the padding area are discarded by `fold`.

    Expects 4D `input` (batch, channels, height, width); `gradOutput` must have
    the shape of the forward output.
    """
    import torch.nn.functional as F

    def as_pair(value):
        return value if isinstance(value, tuple) else (value, value)

    kernel_size = as_pair(self.kernel_size)
    stride = as_pair(self.stride)
    padding = as_pair(self.padding)

    batch_size, channels, height, width = input.shape
    kernel_elements = kernel_size[0] * kernel_size[1]
    num_windows = gradOutput.shape[-2] * gradOutput.shape[-1]

    # (N, C, 1, L) -> (N, C, kH*kW, L) -> (N, C*kH*kW, L): the column layout
    # expected by fold (channel-major, then kernel positions).
    per_element = gradOutput.reshape(batch_size, channels, 1, num_windows) / kernel_elements
    columns = per_element.expand(batch_size, channels, kernel_elements, num_windows)
    columns = columns.reshape(batch_size, channels * kernel_elements, num_windows)

    self.gradInput = F.fold(
        columns,
        (height, width),
        kernel_size,
        stride=stride,
        padding=padding,
        dilation=1
    )

    return self.gradInput


#8. (0.3) Implement **GlobalMaxPool2d** and **GlobalAvgPool2d**. They do not have testing and parameters are up to you but they must aggregate information within channels. Write test functions for these layers on your own.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GlobalMaxPool2d(nn.Module):
    """Max-pool each channel of a 4D input over its whole spatial extent."""

    def forward(self, input):
        """Return a (batch, channels, 1, 1) tensor of per-channel maxima."""
        # Unpacking four values also enforces a 4D input.
        _, _, rows, cols = input.shape
        return F.max_pool2d(input, kernel_size=(rows, cols))

    def __repr__(self):
        return type(self).__name__ + "()"


class GlobalAvgPool2d(nn.Module):
    """Average-pool each channel of a 4D input over its whole spatial extent."""

    def forward(self, input):
        """Return a (batch, channels, 1, 1) tensor of per-channel means."""
        # Unpacking four values also enforces a 4D input.
        _, _, rows, cols = input.shape
        return F.avg_pool2d(input, kernel_size=(rows, cols))

    def __repr__(self):
        return type(self).__name__ + "()"


def test_global_max_pool2d():
    """Check GlobalMaxPool2d's output shape and values on a random 4D tensor."""
    batch_size, channels, height, width = 2, 3, 4, 5
    x = torch.randn(batch_size, channels, height, width)

    output = GlobalMaxPool2d()(x)

    assert output.shape == (batch_size, channels, 1, 1), f"Неверный размер выхода: {output.shape}"

    # Reference: maximum over the two spatial axes of every (sample, channel) plane.
    expected_output = x.amax(dim=(2, 3), keepdim=True)

    assert torch.allclose(output, expected_output), "Результат GlobalMaxPool2d не соответствует ожидаемому"

    print("Тест GlobalMaxPool2d пройден успешно!")


def test_global_avg_pool2d():
    """Check GlobalAvgPool2d's output shape and values on a random 4D tensor."""
    batch_size, channels, height, width = 2, 3, 4, 5
    x = torch.randn(batch_size, channels, height, width)

    output = GlobalAvgPool2d()(x)

    assert output.shape == (batch_size, channels, 1, 1), f"Неверный размер выхода: {output.shape}"

    # Reference: mean over the two spatial axes of every (sample, channel) plane.
    expected_output = x.mean(dim=(2, 3), keepdim=True)

    assert torch.allclose(output, expected_output), "Результат GlobalAvgPool2d не соответствует ожидаемому"

    print("Тест GlobalAvgPool2d пройден успешно!")


# Run both self-checks; each prints a success message once its assertions pass.
test_global_max_pool2d()
test_global_avg_pool2d()


Тест GlobalMaxPool2d пройден успешно!
Тест GlobalAvgPool2d пройден успешно!


#9. (0.2) Implement [**Flatten**](https://pytorch.org/docs/stable/generated/torch.flatten.html)

In [None]:
def updateOutput(self, input):
    """
    Flatten forward pass: collapse dimensions `start_dim..end_dim` (inclusive)
    of `input` into a single dimension.

    Both `self.start_dim` and `self.end_dim` may be negative, in which case
    they count from the last dimension. The original shape is remembered in
    `self.input_shape` for the backward pass.
    """
    self.input_shape = input.shape
    ndim = len(input.shape)

    # Normalise negative indices so the slices below behave as intended.
    start_dim = self.start_dim if self.start_dim >= 0 else ndim + self.start_dim
    end_dim = self.end_dim if self.end_dim >= 0 else ndim + self.end_dim

    flattened_dim = 1
    for size in input.shape[start_dim:end_dim + 1]:
        flattened_dim *= size

    new_shape = [*input.shape[:start_dim], flattened_dim, *input.shape[end_dim + 1:]]
    self.output = input.reshape(new_shape)
    return self.output

def updateGradInput(self, input, gradOutput):
    """
    Flatten backward pass: give `gradOutput` the shape recorded in
    `self.input_shape` by the forward pass.
    """
    original_shape = self.input_shape
    self.gradInput = gradOutput.reshape(original_shape)
    return self.gradInput


# Activation functions

Here's the complete example for the **Rectified Linear Unit** non-linearity (aka **ReLU**):

In [None]:
class ReLU(Module):
    """Rectified Linear Unit: `y = max(x, 0)`, applied element-wise."""

    def __init__(self):
        super().__init__()

    def updateOutput(self, input):
        """Store and return `input` with negative entries replaced by zero."""
        rectified = np.maximum(input, 0)
        self.output = rectified
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Store and return `gradOutput`, zeroed wherever `input` is not positive."""
        positive = input > 0
        self.gradInput = np.multiply(gradOutput, positive)
        return self.gradInput

    def __repr__(self):
        return "ReLU"

## 10. (0.1) Leaky ReLU
Implement [**Leaky Rectified Linear Unit**](http://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29%23Leaky_ReLUs). Experiment with the slope.

In [None]:
class LeakyReLU(Module):
    """
    Leaky ReLU: `y = x` for `x > 0` and `y = slope * x` otherwise,
    applied element-wise.
    """
    def __init__(self, slope = 0.03):
        super(LeakyReLU, self).__init__()
        self.slope = slope

    def updateOutput(self, input):
        """Store and return the activation of `input`."""
        # Select the branch by the sign of the input (not by max(x, slope*x)),
        # so forward and backward agree for every slope, including slopes
        # above 1 or below 0.
        self.output = np.where(input > 0, input, self.slope * input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Store and return the gradient: 1 where `input > 0`, `slope` elsewhere."""
        self.gradInput = np.where(input > 0, 1, self.slope) * gradOutput
        return self.gradInput

    def __repr__(self):
        return "LeakyReLU"


## 11. (0.1) ELU
Implement [**Exponential Linear Units**](http://arxiv.org/abs/1511.07289) activations.

In [None]:
class ELU(Module):
    """
    Exponential Linear Unit: `y = x` for `x > 0` and
    `y = alpha * (exp(x) - 1)` otherwise, applied element-wise.
    """
    def __init__(self, alpha = 1.0):
        super(ELU, self).__init__()
        self.alpha = alpha

    def updateOutput(self, input):
        """Store and return the activation of `input`."""
        # np.where evaluates both branches for every element, so the exponent
        # is capped at 0: positive inputs (whose exp is discarded anyway)
        # can no longer overflow.
        negative_branch = self.alpha * (np.exp(np.minimum(input, 0)) - 1)
        self.output = np.where(input > 0, input, negative_branch)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Store and return the gradient: 1 where `input > 0`, `alpha * exp(x)` elsewhere."""
        negative_slope = self.alpha * np.exp(np.minimum(input, 0))
        self.gradInput = np.where(input > 0, 1, negative_slope) * gradOutput
        return self.gradInput

    def __repr__(self):
        return "ELU"


## 12. (0.1) SoftPlus
Implement [**SoftPlus**](https://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29) activations. Look, how they look a lot like ReLU.

In [None]:
class SoftPlus(Module):
    """SoftPlus: `y = log(1 + exp(x))`, applied element-wise."""

    def __init__(self):
        super(SoftPlus, self).__init__()

    def updateOutput(self, input):
        """Store and return `log(1 + exp(input))`."""
        # logaddexp(0, x) = log(exp(0) + exp(x)) is evaluated without overflow,
        # so large inputs give y ~= x instead of saturating at a clip bound.
        self.output = np.logaddexp(0, input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Store and return the gradient: `sigmoid(input) * gradOutput`."""
        # sigmoid(x) = exp(-log(1 + exp(-x))), again overflow-free.
        sigmoid_x = np.exp(-np.logaddexp(0, -input))
        self.gradInput = sigmoid_x * gradOutput
        return self.gradInput

    def __repr__(self):
        return "SoftPlus"


#13. (0.2) Gelu
Implement [**Gelu**](https://pytorch.org/docs/stable/generated/torch.nn.GELU.html) activations.

In [None]:
class Gelu(Module):
    """
    GELU activation, computed with the tanh approximation

        y = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))

    NOTE(review): this is the approximate form, not the exact erf-based one;
    results differ slightly from an erf-based reference.
    """
    def __init__(self):
        # Must name this class: super(SoftPlus, self) raises TypeError because
        # a Gelu instance is not a SoftPlus.
        super(Gelu, self).__init__()

    def updateOutput(self, input):
        """Store and return the activation of `input`."""
        sqrt_2_over_pi = np.sqrt(2 / np.pi)
        tanh_arg = sqrt_2_over_pi * (input + 0.044715 * np.power(input, 3))
        self.output = input * 0.5 * (1 + np.tanh(tanh_arg))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Store and return the gradient w.r.t. `input` (product and chain rule)."""
        sqrt_2_over_pi = np.sqrt(2 / np.pi)
        x_cubed = np.power(input, 3)
        tanh_arg = sqrt_2_over_pi * (input + 0.044715 * x_cubed)
        tanh_val = np.tanh(tanh_arg)
        dtanh = 1 - np.power(tanh_val, 2)                                # d tanh(u) / du
        darg = sqrt_2_over_pi * (1 + 0.044715 * 3 * np.power(input, 2))  # du / dx

        dgelu = 0.5 * (1 + tanh_val) + 0.5 * input * dtanh * darg

        self.gradInput = dgelu * gradOutput
        return self.gradInput

    def __repr__(self):
        return "Gelu"


# Criterions

Criterions are used to score the models answers.

In [None]:
class Criterion:
    """Base class of loss functions that score a model's answers against targets."""

    def __init__(self):
        self.output = None      # last computed loss value
        self.gradInput = None   # last computed gradient w.r.t. the input

    def forward(self, input, target):
        """
            Compute and return the loss for `input` and `target`.

            For consistency do not override this method;
            put all the code in `updateOutput`.
        """
        return self.updateOutput(input, target)

    def backward(self, input, target):
        """
            Compute and return the gradient of the loss for `input` and `target`.

            For consistency do not override this method;
            put all the code in `updateGradInput`.
        """
        return self.updateGradInput(input, target)

    def updateOutput(self, input, target):
        """
        Function to override. The base version returns the stored `output`.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Function to override. The base version returns the stored `gradInput`.
        """
        return self.gradInput

    def __repr__(self):
        """
        Human-readable name; override in every criterion for a readable description.
        """
        return "Criterion"

The **MSECriterion**, which is basic L2 norm usually used for regression, is implemented here for you.
- input:   **`batch_size x n_feats`**
- target: **`batch_size x n_feats`**
- output: **scalar**

In [None]:
class MSECriterion(Criterion):
    """
    Squared-error loss: the sum of squared differences over all elements,
    divided by the size of the first axis of `input`.
    """
    def __init__(self):
        super().__init__()

    def updateOutput(self, input, target):
        """Store and return the scalar loss for `input` against `target`."""
        residual = input - target
        squared_total = np.sum(np.power(residual, 2))
        self.output = squared_total / input.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """Store and return the gradient of the loss w.r.t. `input`."""
        residual = input - target
        self.gradInput = residual * 2 / input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

## 14. (0.2) Negative LogLikelihood criterion (numerically unstable)
Your task is to implement the **ClassNLLCriterion**. It should implement [multiclass log loss](http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss). Although there is a sum over `y` (target) in that formula,
remember that targets are one-hot encoded, which simplifies the computations a lot. Note that criterions are the only places where you divide by the batch size. There is also a small hack: a small number is added to the probabilities to avoid computing log(0).
- input:   **`batch_size x n_feats`** - probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**



In [None]:
def updateOutput(self, input, target):
    """
    Multiclass log loss for probability inputs.

    Computes -sum(target * log(input)) divided by `input.shape[0]`, stores the
    result in `self.output` and returns it. Reads `self.EPS` for clipping.
    """
    # Clip into [EPS, 1 - EPS] so that log() never receives 0.
    safe_probs = np.clip(input, self.EPS, 1 - self.EPS)

    n_samples = input.shape[0]
    # Multiplying by `target` keeps only the log-probabilities it selects.
    summed = -np.sum(target * np.log(safe_probs))
    self.output = summed / n_samples  # divide by the batch size
    return self.output

def updateGradInput(self, input, target):
    """
    Gradient of the multiclass log loss w.r.t. the probability inputs.

    Computes -target / input / `input.shape[0]` on the clipped probabilities,
    stores the result in `self.gradInput` and returns it. Reads `self.EPS`.
    """
    # Same clipping as in the forward pass, which also prevents division by 0.
    safe_probs = np.clip(input, self.EPS, 1 - self.EPS)

    n_samples = input.shape[0]
    self.gradInput = -target / safe_probs / n_samples
    return self.gradInput


## 15. (0.3) Negative LogLikelihood criterion (numerically stable)
- input:   **`batch_size x n_feats`** - log probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**

Task is similar to the previous one, but now the criterion input is the output of log-softmax layer. This decomposition allows us to avoid problems with computation of forward and backward of log().

In [None]:
def updateOutput(self, input, target):
    """
    Multiclass log loss for log-probability inputs.

    Computes -sum(target * input) divided by `input.shape[0]`, stores the
    result in `self.output` and returns it.
    """
    n_samples = input.shape[0]
    # `input` already holds log-probabilities, so no log() is needed here.
    summed = -np.sum(target * input)
    self.output = summed / n_samples
    return self.output

def updateGradInput(self, input, target):
    """
    Gradient of the log loss w.r.t. log-probability inputs.

    Computes -target / `input.shape[0]`, stores it in `self.gradInput`
    and returns it.
    """
    n_samples = input.shape[0]
    self.gradInput = -target / n_samples
    return self.gradInput


1-я часть задания: реализация слоев, лосей и функций активации - 5 баллов. \\
2-я часть задания: реализация моделей на своих классах. Что должно быть:
  1. Выберите оптимизатор и реализуйте его, чтобы он работал с вашими классами. - 1 балл.
  2. Модель для задачи мультирегрессии на выбранных вами данных. Использовать FCNN, dropout, batchnorm, MSE. Пробуйте различные функции активации. Для первой модели попробуйте большую, среднюю и маленькую модель. - 1 балл.
  3. Модель для задачи мультиклассификации на MNIST. Использовать свёртки, макспулы, флэттэны, софтмаксы - 1 балл.
  4. Автоэнкодер для выбранных вами данных. Должен быть на свёртках и полносвязных слоях, дропаутах, батчнормах и тд. - 2 балла. \\

Дополнительно в оценке каждой модели будет учитываться:
1. Наличие правильно выбранной метрики и лосс функции.
2. Отрисовка графиков лосей и метрик на трейне-валидации. Проверка качества модели на тесте.
3. Наличие шедулера для lr.
4. Наличие вормапа.
5. Наличие механизма ранней остановки и сохранение лучшей модели.
6. Свитч лося (метрики) и оптимайзера.

1

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import os

class Module:
    """
    Base class for layers.

    `forward` and `backward` delegate to `updateOutput` and `updateGradInput`,
    which subclasses are expected to provide. Layers with trainable state
    override `parameters` and `gradParameters`.
    """

    def __init__(self):
        # Layers start in training mode.
        self.training = True

    def forward(self, input):
        """Return `self.updateOutput(input)`."""
        result = self.updateOutput(input)
        return result

    def backward(self, input, gradOutput):
        """Return `self.updateGradInput(input, gradOutput)`."""
        grad = self.updateGradInput(input, gradOutput)
        return grad

    def parameters(self):
        """Return the list of trainable arrays (empty for the base class)."""
        return []

    def gradParameters(self):
        """Return the list of gradient arrays (empty for the base class)."""
        return []

    def train(self):
        """Switch the module to training mode."""
        self.training = True

    def eval(self):
        """Switch the module to evaluation mode."""
        self.training = False

    def zero_grad(self):
        """Fill every array returned by `gradParameters()` with zeros, in place."""
        for buffer in self.gradParameters():
            buffer.fill(0)


class ReLU(Module):
    """Rectified linear unit: element-wise `max(0, x)`."""

    def __init__(self):
        super().__init__()
        self.gradInput = None
        self.output = None

    def updateOutput(self, input):
        """Store and return the element-wise maximum of 0 and `input`."""
        rectified = np.maximum(0, input)
        self.output = rectified
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Pass `gradOutput` through where `input` is strictly positive, zero elsewhere."""
        positive = input > 0
        self.gradInput = gradOutput * positive
        return self.gradInput

class LeakyReLU(Module):
    """Leaky ReLU: identity for positive inputs, slope `alpha` otherwise."""

    def __init__(self, alpha=0.01):
        super().__init__()
        self.alpha = alpha
        self.gradInput = None
        self.output = None

    def updateOutput(self, input):
        """Store and return `input` where it is positive and `input * alpha` elsewhere."""
        scaled = input * self.alpha
        self.output = np.where(input > 0, input, scaled)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Scale `gradOutput` by `alpha` wherever `input` is not strictly positive."""
        leaked = gradOutput * self.alpha
        self.gradInput = np.where(input > 0, gradOutput, leaked)
        return self.gradInput

class Sigmoid(Module):
    """Logistic sigmoid activation, `1 / (1 + exp(-x))`."""

    def __init__(self):
        super().__init__()
        self.gradInput = None
        self.output = None

    def updateOutput(self, input):
        """Store and return the sigmoid of `input`."""
        denominator = 1 + np.exp(-input)
        self.output = 1 / denominator
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Multiply `gradOutput` by s * (1 - s), where s is the output stored by
        the most recent `updateOutput` call (`input` itself is not re-evaluated).
        """
        activation = self.output
        complement = 1 - activation
        self.gradInput = gradOutput * activation * complement
        return self.gradInput

class Tanh(Module):
    """Hyperbolic tangent activation."""

    def __init__(self):
        super().__init__()
        self.gradInput = None
        self.output = None

    def updateOutput(self, input):
        """Store and return `tanh(input)`."""
        activation = np.tanh(input)
        self.output = activation
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Multiply `gradOutput` by 1 - t^2, where t is the output stored by the
        most recent `updateOutput` call (`input` itself is not re-evaluated).
        """
        local_slope = 1 - np.square(self.output)
        self.gradInput = gradOutput * local_slope
        return self.gradInput

class Softmax(Module):
    """
    Softmax activation along `axis`.

    The per-slice maximum is subtracted before exponentiating; this leaves
    the result unchanged but keeps `exp` from overflowing.
    """
    def __init__(self, axis=-1):
        super(Softmax, self).__init__()
        self.axis = axis
        self.output = None
        self.gradInput = None

    def _softmax(self, input):
        """Return softmax(`input`) along `self.axis` without touching any field."""
        shifted_input = input - np.max(input, axis=self.axis, keepdims=True)
        exp_values = np.exp(shifted_input)
        return exp_values / np.sum(exp_values, axis=self.axis, keepdims=True)

    def updateOutput(self, input):
        """Store and return softmax(`input`)."""
        self.output = self._softmax(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Back-propagate `gradOutput` through the softmax.

        For s = softmax(x) the Jacobian is diag(s) - s s^T, therefore
        gradInput = s * (gradOutput - sum(gradOutput * s)) along `axis`.
        Returning `gradOutput` unchanged would ignore this coupling between
        the entries of one slice.
        """
        # Recompute from `input` so the result does not depend on which
        # forward call happened last.
        probs = self._softmax(input)
        weighted_sum = np.sum(gradOutput * probs, axis=self.axis, keepdims=True)
        self.gradInput = probs * (gradOutput - weighted_sum)
        return self.gradInput

class LogSoftmax(Module):
    """
    Log-softmax along `axis`, computed as `x - max(x) - log(sum(exp(x - max(x))))`
    so that `exp` cannot overflow.
    """
    def __init__(self, axis=-1):
        super(LogSoftmax, self).__init__()
        self.axis = axis
        self.output = None
        self.gradInput = None

    def updateOutput(self, input):
        """Store and return log(softmax(`input`))."""
        shifted_input = input - np.max(input, axis=self.axis, keepdims=True)
        exp_values = np.exp(shifted_input)
        sum_exp = np.sum(exp_values, axis=self.axis, keepdims=True)
        self.output = shifted_input - np.log(sum_exp)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Back-propagate `gradOutput` through the log-softmax.

        With y_i = x_i - logsumexp(x), dy_i/dx_j = [i == j] - softmax_j, so
        gradInput = gradOutput - softmax * sum(gradOutput) along `axis`.
        The softmax is recovered from the output stored by the most recent
        `updateOutput` call.
        """
        softmax_output = np.exp(self.output)
        # The sum is over the raw upstream gradient; weighting it by the
        # softmax (as the softmax Jacobian does) would be wrong here.
        grad_total = np.sum(gradOutput, axis=self.axis, keepdims=True)
        self.gradInput = gradOutput - softmax_output * grad_total
        return self.gradInput


class Linear(Module):
    """
    Fully connected layer: `output = input . weight^T + bias`.

    `weight` has shape `(out_features, in_features)` and `bias` has shape
    `(out_features,)`; both are drawn uniformly from
    `[-1/sqrt(in_features), 1/sqrt(in_features)]`.
    """

    def __init__(self, in_features, out_features):
        super().__init__()

        bound = 1. / np.sqrt(in_features)
        self.weight = np.random.uniform(-bound, bound, (out_features, in_features))
        self.bias = np.random.uniform(-bound, bound, (out_features,))

        # Gradient buffers start at zero with the same shapes as the parameters.
        self.gradWeight = np.zeros_like(self.weight)
        self.gradBias = np.zeros_like(self.bias)

        self.output = None
        self.gradInput = None
        self.input_cache = None

    def updateOutput(self, input):
        """Remember `input`, then store and return the affine transform of it."""
        self.input_cache = input
        projected = np.dot(input, self.weight.T)
        self.output = projected + self.bias
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradients w.r.t. the input, the weight and the bias.

        The parameter gradients replace the previous `gradWeight` / `gradBias`
        arrays; the gradient w.r.t. the input is stored and returned.
        """
        grad_input = np.dot(gradOutput, self.weight)
        grad_weight = np.dot(gradOutput.T, input)
        grad_bias = np.sum(gradOutput, axis=0)

        self.gradInput = grad_input
        self.gradWeight = grad_weight
        self.gradBias = grad_bias
        return self.gradInput

    def parameters(self):
        """Return `[weight, bias]`."""
        return [self.weight, self.bias]

    def gradParameters(self):
        """Return `[gradWeight, gradBias]`, in the same order as `parameters()`."""
        return [self.gradWeight, self.gradBias]

class BatchNorm1d(Module):
    """
    Batch normalisation over axis 0 with a learnable scale (`weight`) and
    shift (`bias`) per feature.

    In training mode the batch mean and variance are used and folded into
    `running_mean` / `running_var` with weight `momentum`; in evaluation mode
    the running statistics are used instead.
    """
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super(BatchNorm1d, self).__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        # Learnable scale and shift.
        self.weight = np.ones((num_features,))
        self.bias = np.zeros((num_features,))

        self.gradWeight = np.zeros_like(self.weight)
        self.gradBias = np.zeros_like(self.bias)

        # Statistics used in evaluation mode.
        self.running_mean = np.zeros((num_features,))
        self.running_var = np.ones((num_features,))

        # Values saved by the training-mode forward pass for the backward pass.
        self.input_cache = None
        self.norm_cache = None
        self.std_cache = None
        self.output = None
        self.gradInput = None

    def updateOutput(self, input):
        """Normalise `input`, apply scale and shift, store and return the result."""
        if self.training:
            batch_mean = np.mean(input, axis=0)
            batch_var = np.var(input, axis=0)

            # Exponential moving average of the batch statistics.
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var

            std = np.sqrt(batch_var + self.eps)
            norm_input = (input - batch_mean) / std

            self.input_cache = input
            self.norm_cache = norm_input
            self.std_cache = std
        else:
            norm_input = (input - self.running_mean) / np.sqrt(self.running_var + self.eps)

        self.output = self.weight * norm_input + self.bias
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Back-propagate `gradOutput`.

        In evaluation mode the layer is a fixed affine map, so only the input
        gradient is computed. In training mode the gradients of `weight` and
        `bias` are stored as well, and the input gradient accounts for the
        dependence of the batch mean and variance on `input`. The training
        branch reads the normalised input and the standard deviation saved by
        the last training-mode forward pass.
        """
        if not self.training:
            self.gradInput = gradOutput * self.weight / np.sqrt(self.running_var + self.eps)
            return self.gradInput

        batch_size = input.shape[0]
        norm_input = self.norm_cache
        std = self.std_cache

        self.gradWeight = np.sum(gradOutput * norm_input, axis=0)
        self.gradBias = np.sum(gradOutput, axis=0)

        # Centre the input once; it is needed for the variance, mean and input terms.
        centered = input - np.mean(input, axis=0)

        dNorm = gradOutput * self.weight
        dVar = np.sum(dNorm * centered * -0.5 * np.power(std, -3), axis=0)
        dMean = np.sum(dNorm * -1 / std, axis=0) + dVar * np.mean(-2 * centered, axis=0)

        self.gradInput = dNorm / std + dVar * 2 * centered / batch_size + dMean / batch_size
        return self.gradInput

    def parameters(self):
        """Return `[weight, bias]`."""
        return [self.weight, self.bias]

    def gradParameters(self):
        """Return `[gradWeight, gradBias]`, in the same order as `parameters()`."""
        return [self.gradWeight, self.gradBias]

class Dropout(Module):
    """
    Inverted dropout.

    In training mode each element is zeroed with probability `p` and the
    survivors are scaled by `1 / (1 - p)`; in evaluation mode the layer is the
    identity.

    Raises:
        ValueError: if `p` is not within [0, 1].
    """
    def __init__(self, p=0.5):
        super(Dropout, self).__init__()
        if not 0 <= p <= 1:
            raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p
        self.mask = None
        self.output = None
        self.gradInput = None

    def updateOutput(self, input):
        """Apply a freshly sampled mask in training mode; return `input` otherwise."""
        if self.training:
            keep_prob = 1 - self.p
            if keep_prob > 0:
                self.mask = np.random.binomial(1, keep_prob, input.shape) / keep_prob
            else:
                # p == 1 drops everything; dividing by keep_prob == 0 would
                # turn the mask into NaN instead of zeros.
                self.mask = np.zeros(input.shape)
            self.output = input * self.mask
        else:
            self.output = input
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Apply the mask saved by the forward pass in training mode; pass through otherwise."""
        if self.training:
            self.gradInput = gradOutput * self.mask
        else:
            self.gradInput = gradOutput
        return self.gradInput

class Flatten(Module):
    """Reshape every sample to a vector while keeping the first axis."""

    def __init__(self):
        super().__init__()
        self.input_shape = None
        self.gradInput = None
        self.output = None

    def updateOutput(self, input):
        """Remember the shape of `input`, then store and return it reshaped to `(input.shape[0], -1)`."""
        self.input_shape = input.shape
        n_samples = input.shape[0]
        self.output = input.reshape(n_samples, -1)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Reshape `gradOutput` back to the shape remembered by the forward pass."""
        original_shape = self.input_shape
        self.gradInput = gradOutput.reshape(original_shape)
        return self.gradInput

class Conv2d(Module):
    """
    2-D convolution (implemented as cross-correlation, i.e. the kernel is not
    flipped) for inputs of shape `batch x in_channels x height x width`.

    `kernel_size`, `stride` and `padding` accept either an int (used for both
    spatial axes) or a `(height, width)` tuple. Padding is done with zeros.
    `weight` has shape `(out_channels, in_channels, kernel_h, kernel_w)`.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super(Conv2d, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.padding = padding if isinstance(padding, tuple) else (padding, padding)

        stdv = 1. / np.sqrt(in_channels * self.kernel_size[0] * self.kernel_size[1])
        self.weight = np.random.uniform(-stdv, stdv, (out_channels, in_channels, self.kernel_size[0], self.kernel_size[1]))
        self.bias = np.random.uniform(-stdv, stdv, (out_channels,))

        self.gradWeight = np.zeros_like(self.weight)
        self.gradBias = np.zeros_like(self.bias)

        self.output = None
        self.gradInput = None
        self.input_cache = None
        self.input_padded = None

    def _pad_input(self, input):
        """Return `input` surrounded by `self.padding` zeros (or `input` itself if no padding)."""
        if self.padding[0] == 0 and self.padding[1] == 0:
            return input

        batch_size, channels, height, width = input.shape
        padded_height = height + 2 * self.padding[0]
        padded_width = width + 2 * self.padding[1]

        padded = np.zeros((batch_size, channels, padded_height, padded_width))
        padded[:, :, self.padding[0]:self.padding[0]+height, self.padding[1]:self.padding[1]+width] = input

        return padded

    def updateOutput(self, input):
        """
        Convolve `input` with `weight`, add `bias`, store and return the result
        of shape `batch x out_channels x out_height x out_width`.
        """
        self.input_cache = input
        batch_size, channels, height, width = input.shape

        self.input_padded = self._pad_input(input)
        padded_height, padded_width = self.input_padded.shape[2], self.input_padded.shape[3]

        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride

        output_height = (padded_height - kernel_h) // stride_h + 1
        output_width = (padded_width - kernel_w) // stride_w + 1

        self.output = np.zeros((batch_size, self.out_channels, output_height, output_width))

        # Only the output positions are looped over in Python; the batch and
        # channel dimensions are contracted by tensordot.
        for h_out in range(output_height):
            h_start = h_out * stride_h
            h_end = h_start + kernel_h
            for w_out in range(output_width):
                w_start = w_out * stride_w
                w_end = w_start + kernel_w

                # patch: (batch, in_channels, kernel_h, kernel_w)
                patch = self.input_padded[:, :, h_start:h_end, w_start:w_end]
                # Contract channel and kernel axes -> (batch, out_channels).
                self.output[:, :, h_out, w_out] = np.tensordot(
                    patch, self.weight, axes=([1, 2, 3], [1, 2, 3]))

        self.output += self.bias.reshape(1, -1, 1, 1)

        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Back-propagate `gradOutput` using the padded input saved by the last
        forward pass.

        `gradWeight` and `gradBias` are both overwritten with the gradients of
        this call (the weight gradient is reset first), so the two buffers
        always describe the same backward pass. Returns the gradient w.r.t.
        the un-padded input.
        """
        batch_size, channels, height, width = input.shape

        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride

        self.gradInput = np.zeros_like(self.input_padded)
        # Start from zero so the weight gradient does not accumulate across
        # calls while the bias gradient is overwritten.
        self.gradWeight.fill(0)

        output_height = gradOutput.shape[2]
        output_width = gradOutput.shape[3]

        for h_out in range(output_height):
            h_start = h_out * stride_h
            h_end = h_start + kernel_h
            for w_out in range(output_width):
                w_start = w_out * stride_w
                w_end = w_start + kernel_w

                patch = self.input_padded[:, :, h_start:h_end, w_start:w_end]
                # grad_here: (batch, out_channels)
                grad_here = gradOutput[:, :, h_out, w_out]

                # Sum over the batch -> (out_channels, in_channels, kernel_h, kernel_w).
                self.gradWeight += np.tensordot(grad_here, patch, axes=([0], [0]))
                # Sum over output channels -> (batch, in_channels, kernel_h, kernel_w).
                self.gradInput[:, :, h_start:h_end, w_start:w_end] += np.tensordot(
                    grad_here, self.weight, axes=([1], [0]))

        self.gradBias = np.sum(gradOutput, axis=(0, 2, 3))

        # Drop the gradient that fell onto the zero padding.
        if self.padding[0] > 0 or self.padding[1] > 0:
            self.gradInput = self.gradInput[:, :, self.padding[0]:self.padding[0]+height, self.padding[1]:self.padding[1]+width]

        return self.gradInput

    def parameters(self):
        """Return `[weight, bias]`."""
        return [self.weight, self.bias]

    def gradParameters(self):
        """Return `[gradWeight, gradBias]`, in the same order as `parameters()`."""
        return [self.gradWeight, self.gradBias]

class MaxPool2d(Module):
    """
    2-D max pooling for inputs of shape `batch x channels x height x width`.

    `kernel_size`, `stride` and `padding` accept an int or a `(height, width)`
    tuple; `stride` defaults to the kernel size. Padding is filled with `-inf`.
    """

    def __init__(self, kernel_size, stride=None, padding=0):
        super().__init__()

        def as_pair(value):
            # An int applies to both spatial axes.
            return value if isinstance(value, tuple) else (value, value)

        self.kernel_size = as_pair(kernel_size)
        self.stride = as_pair(stride if stride is not None else self.kernel_size)
        self.padding = as_pair(padding)

        self.output = None
        self.gradInput = None
        self.indices = None

    def _pad_input(self, input):
        """Return `input` surrounded by `-inf` according to `self.padding` (or `input` itself)."""
        pad_h, pad_w = self.padding[0], self.padding[1]
        if pad_h == 0 and pad_w == 0:
            return input

        batch_size, channels, height, width = input.shape
        padded_shape = (batch_size, channels, height + 2 * pad_h, width + 2 * pad_w)

        padded = np.zeros(padded_shape) - np.inf
        padded[:, :, pad_h:pad_h+height, pad_w:pad_w+width] = input

        return padded

    def updateOutput(self, input):
        """
        Take the maximum of every pooling window, store and return the result.

        The position of each maximum (in padded coordinates) is saved in
        `self.indices` for the backward pass.
        """
        batch_size, channels, height, width = input.shape

        input_padded = self._pad_input(input)
        padded_height, padded_width = input_padded.shape[2], input_padded.shape[3]

        kernel_h, kernel_w = self.kernel_size[0], self.kernel_size[1]
        stride_h, stride_w = self.stride[0], self.stride[1]

        output_height = (padded_height - kernel_h) // stride_h + 1
        output_width = (padded_width - kernel_w) // stride_w + 1

        self.output = np.zeros((batch_size, channels, output_height, output_width))
        self.indices = np.zeros((batch_size, channels, output_height, output_width, 2), dtype=np.int32)

        for b in range(batch_size):
            for c in range(channels):
                plane = input_padded[b, c]
                for row in range(output_height):
                    top = row * stride_h
                    for col in range(output_width):
                        left = col * stride_w

                        window = plane[top:top + kernel_h, left:left + kernel_w]
                        offset = np.unravel_index(np.argmax(window), window.shape)
                        self.output[b, c, row, col] = window[offset]
                        self.indices[b, c, row, col] = [top + offset[0], left + offset[1]]

        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Route every element of `gradOutput` to the position that produced the
        corresponding maximum in the last forward pass; return the gradient
        w.r.t. the un-padded input.
        """
        batch_size, channels, height, width = input.shape
        pad_h, pad_w = self.padding[0], self.padding[1]

        self.gradInput = np.zeros((batch_size, channels, height + 2 * pad_h, width + 2 * pad_w))

        out_rows, out_cols = gradOutput.shape[2], gradOutput.shape[3]
        for b in range(batch_size):
            for c in range(channels):
                for row in range(out_rows):
                    for col in range(out_cols):
                        src_row, src_col = self.indices[b, c, row, col]
                        self.gradInput[b, c, src_row, src_col] += gradOutput[b, c, row, col]

        # Drop the border that corresponds to padding.
        if pad_h > 0 or pad_w > 0:
            self.gradInput = self.gradInput[:, :, pad_h:pad_h+height, pad_w:pad_w+width]

        return self.gradInput

class MSELoss(Module):
    """Mean squared error, averaged over every element of `input`."""

    def __init__(self):
        super().__init__()
        self.gradInput = None
        self.output = None

    def updateOutput(self, input, target):
        """Store and return the mean of the squared differences."""
        residual = input - target
        self.output = np.mean(np.square(residual))
        return self.output

    def updateGradInput(self, input, target):
        """Store and return the gradient `2 * (input - target) / input.size`."""
        residual = input - target
        self.gradInput = 2 * residual / input.size
        return self.gradInput

class CrossEntropyLoss(Module):
    """
    Cross-entropy between `target` and log-softmax(`input`), divided by
    `input.shape[0]`.

    `input` holds raw scores; the log-softmax is applied internally by a
    `LogSoftmax()` instance (default axis).
    """
    def __init__(self):
        super(CrossEntropyLoss, self).__init__()
        self.output = None
        self.gradInput = None
        self.logsoftmax = LogSoftmax()

    def updateOutput(self, input, target):
        """Store and return `-sum(target * log_softmax(input)) / input.shape[0]`."""
        log_probs = self.logsoftmax.updateOutput(input)
        self.output = -np.sum(target * log_probs) / input.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """
        Store and return the gradient of the loss w.r.t. `input`.

        Differentiating through the internal log-softmax gives
        `(softmax(input) * sum(target) - target) / input.shape[0]`, which is
        `(softmax(input) - target) / batch` for one-hot targets. `-target / batch`
        alone is only the gradient w.r.t. the log-probabilities, not w.r.t. the
        scores this loss receives.
        """
        log_probs = self.logsoftmax.updateOutput(input)
        probs = np.exp(log_probs)
        # Equals 1 per sample for one-hot targets; kept general for soft targets.
        target_mass = np.sum(target, axis=self.logsoftmax.axis, keepdims=True)
        self.gradInput = (probs * target_mass - target) / input.shape[0]
        return self.gradInput

class NLLLoss(Module):
    """Negative log-likelihood for inputs that already hold log-probabilities."""

    def __init__(self):
        super().__init__()
        self.gradInput = None
        self.output = None

    def updateOutput(self, input, target):
        """Store and return `-sum(target * input) / input.shape[0]`."""
        n_samples = input.shape[0]
        summed = -np.sum(target * input)
        self.output = summed / n_samples
        return self.output

    def updateGradInput(self, input, target):
        """Store and return `-target / input.shape[0]`."""
        n_samples = input.shape[0]
        self.gradInput = -target / n_samples
        return self.gradInput
class Sequential(Module):
    """
    Container that applies its modules one after another.

    The forward pass remembers the input of every module so that the backward
    pass can hand each module exactly the input it saw. Re-running the forward
    pass during backward would draw new random masks in layers such as
    Dropout and update running statistics a second time.
    """
    def __init__(self, *args):
        super(Sequential, self).__init__()
        self.modules = list(args)
        self.output = None
        self.gradInput = None
        # Inputs seen by each module during the most recent forward pass.
        self._layer_inputs = None

    def add(self, module):
        """Append `module` to the end of the chain."""
        self.modules.append(module)

    def updateOutput(self, input):
        """Feed `input` through every module in order; store and return the final output."""
        layer_inputs = []
        current_input = input
        for module in self.modules:
            layer_inputs.append(current_input)
            current_input = module.forward(current_input)
        self._layer_inputs = layer_inputs
        self.output = current_input
        return self.output

    def _inputs_for_backward(self, input):
        """Return the per-module inputs, reusing the forward cache when it belongs to `input`."""
        cached = getattr(self, "_layer_inputs", None)
        cache_matches = (
            cached is not None
            and len(cached) == len(self.modules)
            and (not cached or cached[0] is input)
        )
        if cache_matches:
            return cached

        # No usable cache (backward called without a matching forward call,
        # or modules were added since): recompute the intermediate outputs.
        layer_inputs = [input]
        for module in self.modules[:-1]:
            layer_inputs.append(module.forward(layer_inputs[-1]))
        return layer_inputs

    def updateGradInput(self, input, gradOutput):
        """
        Back-propagate `gradOutput` through the modules in reverse order;
        store and return the gradient w.r.t. `input`.
        """
        layer_inputs = self._inputs_for_backward(input)

        current_gradOutput = gradOutput
        for module, layer_input in zip(reversed(self.modules), reversed(layer_inputs)):
            current_gradOutput = module.backward(layer_input, current_gradOutput)

        self.gradInput = current_gradOutput
        return self.gradInput

    def parameters(self):
        """Return the concatenated `parameters()` of all modules, in order."""
        params = []
        for module in self.modules:
            params.extend(module.parameters())
        return params

    def gradParameters(self):
        """Return the concatenated `gradParameters()` of all modules, in order."""
        grad_params = []
        for module in self.modules:
            grad_params.extend(module.gradParameters())
        return grad_params

    def train(self):
        """Switch this container and every contained module to training mode."""
        self.training = True
        for module in self.modules:
            module.train()

    def eval(self):
        """Switch this container and every contained module to evaluation mode."""
        self.training = False
        for module in self.modules:
            module.eval()

    def zero_grad(self):
        """Call `zero_grad()` on every contained module."""
        for module in self.modules:
            module.zero_grad()


2

In [None]:

class Optimizer:
    """Base class for optimizers: keeps the parameter collection and the learning rate."""

    def __init__(self, params, lr=0.01):
        self.params = params
        self.lr = lr

    def zero_grad(self):
        """
        Zero the `.grad` buffer of every parameter that carries one.

        Parameters without a `.grad` attribute (plain NumPy arrays, for
        example) are skipped instead of raising `AttributeError`.
        """
        for param in self.params:
            grad = getattr(param, "grad", None)
            if grad is not None:
                grad.fill(0)

    def step(self):
        """Apply one update. The base class does nothing."""
        pass

class SGD(Optimizer):
    """
    Stochastic gradient descent with optional momentum and L2 weight decay.

    Args:
        parameters: list of parameter arrays (one level of nested lists is
            flattened), or a module-like object exposing `parameters()` and
            `gradParameters()`.
        lr: learning rate.
        momentum: momentum factor; 0 disables the velocity buffers' effect.
        weight_decay: coefficient added as `weight_decay * param` to the gradient.
        grad_parameters: where the gradients come from. Either a list of
            gradient arrays aligned with `parameters`, or a zero-argument
            callable returning such a list (it is called on every `step`,
            which is required when layers replace their gradient arrays on
            each backward pass). When omitted and `parameters` is a
            module-like object, its `gradParameters` method is used.

    If no gradient source is available, `gradParameters` falls back to the
    parameter arrays themselves (the historical behaviour) and `step` emits a
    `RuntimeWarning`, because the weights are then used as their own gradient.
    Assigning a list to `self.gradParameters` before `step` is also supported.
    """

    def __init__(self, parameters, lr=0.01, momentum=0, weight_decay=0, grad_parameters=None):
        grad_source = grad_parameters
        if grad_source is None and hasattr(parameters, "gradParameters"):
            # Module-like object: ask it for fresh gradients on every step.
            grad_source = parameters.gradParameters
        if hasattr(parameters, "parameters") and callable(parameters.parameters):
            parameters = parameters.parameters()

        flat_params = self._flatten(parameters)
        super(SGD, self).__init__(flat_params, lr)

        # Parameters and gradients are flattened the same way so that they
        # stay aligned when zipped together in `step`.
        self.parameters = flat_params
        self._grad_source = grad_source if callable(grad_source) else None
        if grad_source is None:
            self.gradParameters = list(flat_params)
        elif callable(grad_source):
            self.gradParameters = self._flatten(grad_source())
        else:
            self.gradParameters = self._flatten(grad_source)

        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay

        self.velocity = [np.zeros_like(param) for param in flat_params]

    @staticmethod
    def _flatten(items):
        """Return a flat list, expanding one level of nested lists."""
        flat = []
        for item in items:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        return flat

    def step(self):
        """Update every parameter in place from its current gradient."""
        import warnings

        if self._grad_source is not None:
            self.gradParameters = self._flatten(self._grad_source())

        if any(grad is param for param, grad in zip(self.parameters, self.gradParameters)):
            warnings.warn(
                "SGD has no gradient source: parameters are being used as their own "
                "gradients. Pass `grad_parameters=` or a module to the constructor.",
                RuntimeWarning,
                stacklevel=2,
            )

        for param, grad, vel in zip(self.parameters, self.gradParameters, self.velocity):
            if self.weight_decay != 0:
                grad = grad + self.weight_decay * param

            if self.momentum > 0:
                # The velocity buffer is updated in place so it persists across steps.
                vel *= self.momentum
                vel += grad
                param -= self.lr * vel
            else:
                param -= self.lr * grad

class Adam(Optimizer):
    """
    Adam optimizer with bias-corrected first and second moment estimates and
    optional L2 weight decay.

    Args:
        parameters: list of parameter arrays (one level of nested lists is
            flattened), or a module-like object exposing `parameters()` and
            `gradParameters()`.
        lr: learning rate.
        betas: decay rates of the first and second moment estimates.
        eps: constant added to the denominator.
        weight_decay: coefficient added as `weight_decay * param` to the gradient.
        grad_parameters: where the gradients come from. Either a list of
            gradient arrays aligned with `parameters`, or a zero-argument
            callable returning such a list (it is called on every `step`,
            which is required when layers replace their gradient arrays on
            each backward pass). When omitted and `parameters` is a
            module-like object, its `gradParameters` method is used.

    If no gradient source is available, `gradParameters` falls back to the
    parameter arrays themselves (the historical behaviour) and `step` emits a
    `RuntimeWarning`, because the weights are then used as their own gradient.
    Assigning a list to `self.gradParameters` before `step` is also supported.
    """

    def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 grad_parameters=None):
        grad_source = grad_parameters
        if grad_source is None and hasattr(parameters, "gradParameters"):
            # Module-like object: ask it for fresh gradients on every step.
            grad_source = parameters.gradParameters
        if hasattr(parameters, "parameters") and callable(parameters.parameters):
            parameters = parameters.parameters()

        flat_params = self._flatten(parameters)
        super(Adam, self).__init__(flat_params, lr)

        # Parameters and gradients are flattened the same way so that they
        # stay aligned when zipped together in `step`.
        self.parameters = flat_params
        self._grad_source = grad_source if callable(grad_source) else None
        if grad_source is None:
            self.gradParameters = list(flat_params)
        elif callable(grad_source):
            self.gradParameters = self._flatten(grad_source())
        else:
            self.gradParameters = self._flatten(grad_source)

        self.lr = lr
        self.betas = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.m = [np.zeros_like(param) for param in flat_params]
        self.v = [np.zeros_like(param) for param in flat_params]
        self.t = 0

    @staticmethod
    def _flatten(items):
        """Return a flat list, expanding one level of nested lists."""
        flat = []
        for item in items:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        return flat

    def step(self):
        """Update every parameter in place from its current gradient."""
        import warnings

        self.t += 1
        beta1, beta2 = self.betas

        if self._grad_source is not None:
            self.gradParameters = self._flatten(self._grad_source())

        if any(grad is param for param, grad in zip(self.parameters, self.gradParameters)):
            warnings.warn(
                "Adam has no gradient source: parameters are being used as their own "
                "gradients. Pass `grad_parameters=` or a module to the constructor.",
                RuntimeWarning,
                stacklevel=2,
            )

        for param, grad, m, v in zip(self.parameters, self.gradParameters, self.m, self.v):
            if self.weight_decay != 0:
                grad = grad + self.weight_decay * param

            # Moment buffers are updated in place so they persist across steps.
            m *= beta1
            m += (1 - beta1) * grad

            v *= beta2
            v += (1 - beta2) * (grad ** 2)

            # Bias correction for the zero-initialised moments.
            m_hat = m / (1 - beta1 ** self.t)
            v_hat = v / (1 - beta2 ** self.t)

            param -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

class LRScheduler:
    """Base class for learning-rate schedules attached to an optimizer."""

    def __init__(self, optimizer):
        # Remember the learning rate the optimizer had when the schedule was created.
        self.optimizer = optimizer
        self.base_lr = optimizer.lr

    def step(self):
        """Advance the schedule. The base class does nothing."""
        pass

class StepLR(LRScheduler):
    """Multiply the optimizer's learning rate by `gamma` every `step_size` calls to `step`."""

    def __init__(self, optimizer, step_size, gamma=0.1):
        super().__init__(optimizer)
        self.epoch = 0
        self.gamma = gamma
        self.step_size = step_size

    def step(self):
        """Count one epoch and decay the learning rate when the count is a multiple of `step_size`."""
        self.epoch += 1
        at_boundary = self.epoch % self.step_size == 0
        if at_boundary:
            self.optimizer.lr *= self.gamma

class ReduceLROnPlateau(LRScheduler):
    """
    Lower the learning rate when a monitored value stops improving.

    A value counts as an improvement when it beats the best value so far by
    more than `threshold` (lower is better for `mode='min'`, higher otherwise).
    Once more than `patience` consecutive calls bring no improvement, the
    learning rate is multiplied by `factor`, but never set below `min_lr`.
    """

    def __init__(self, optimizer, mode='min', factor=0.1, patience=10, threshold=1e-4, min_lr=0):
        super().__init__(optimizer)
        self.mode = mode
        self.factor = factor
        self.patience = patience
        self.threshold = threshold
        self.min_lr = min_lr

        self.num_bad_epochs = 0
        # Start from the worst possible value so the first call always improves.
        self.best = float('inf') if mode == 'min' else float('-inf')

    def step(self, metrics):
        """Record the monitored value `metrics` and reduce the learning rate if needed."""
        current = metrics

        if self.mode == 'min':
            improved = current < self.best - self.threshold
        else:
            improved = current > self.best + self.threshold

        if improved:
            self.best = current
            self.num_bad_epochs = 0
        else:
            self.num_bad_epochs += 1

        if self.num_bad_epochs > self.patience:
            reduced = self.optimizer.lr * self.factor
            self.optimizer.lr = max(reduced, self.min_lr)
            self.num_bad_epochs = 0
class WarmupScheduler(LRScheduler):
    """
    Linear warm-up: over the first `warmup_steps` calls to `step` the
    learning rate moves from `initial_lr` towards the optimizer's base rate.
    """

    def __init__(self, optimizer, warmup_steps, initial_lr=0):
        super().__init__(optimizer)
        self.step_count = 0
        self.initial_lr = initial_lr
        self.warmup_steps = warmup_steps

    def step(self):
        """Count one step and, while still warming up, set the interpolated learning rate."""
        self.step_count += 1
        still_warming_up = self.step_count <= self.warmup_steps
        if still_warming_up:
            progress = self.step_count / self.warmup_steps
            span = self.base_lr - self.initial_lr
            self.optimizer.lr = self.initial_lr + progress * span

class EarlyStopping:
    """
    Track a score across calls, keep a copy of the best model, and signal
    when the score has not improved for `patience` consecutive calls.

    A score counts as an improvement when it beats the best score so far by
    more than `min_delta` (lower is better for `mode='min'`, higher otherwise).
    """

    def __init__(self, patience=7, min_delta=0, mode='min'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        # Start from the worst possible value so the first call always improves.
        self.best_score = float('-inf') if mode != 'min' else float('inf')
        self.early_stop = False
        self.best_model = None

    def __call__(self, score, model):
        """Register `score` for `model` and return whether training should stop."""
        if self.mode == 'min':
            improved = score < self.best_score - self.min_delta
        else:
            improved = score > self.best_score + self.min_delta

        if improved:
            self.best_score = score
            self.counter = 0
            self.best_model = self._save_model(model)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

        return self.early_stop

    def _save_model(self, model):
        """Return an independent copy of `model` made by a pickle round trip."""
        return pickle.loads(pickle.dumps(model))

    def get_best_model(self):
        """Return the copy saved at the best score seen so far (None if nothing was saved)."""
        return self.best_model

class Model:
    """Training harness tying together a network, a loss and an optimizer.

    ``self.network`` starts as None and must be assigned (typically by a
    subclass constructor) before :meth:`compile` is called, because
    ``compile`` reads ``self.network.parameters()``.  ``history`` collects the
    per-epoch losses and metric averages produced by :meth:`fit`.
    """

    def __init__(self):
        self.network = None
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        self.early_stopping = None
        # Names of the metrics to compute; only the keys are read by this class.
        self.metrics = {}
        self.history = {'train_loss': [], 'val_loss': [], 'train_metrics': {}, 'val_metrics': {}}

    def compile(self, optimizer='sgd', loss='mse', metrics=None, lr=0.01, momentum=0, weight_decay=0):
        """Create the loss, the optimizer and the metric list.

        Args:
            optimizer: ``'sgd'`` or ``'adam'``.
            loss: ``'mse'``, ``'cross_entropy'`` or ``'nll'``.
            metrics: Optional iterable of metric names.  :meth:`calculate_metrics`
                understands ``'accuracy'``, ``'mse'`` and ``'mae'``.
            lr: Learning rate passed to the optimizer.
            momentum: Momentum passed to SGD (not used for Adam).
            weight_decay: Weight decay passed to the optimizer.

        Raises:
            ValueError: If ``loss`` or ``optimizer`` is not one of the
                supported names.
        """
        if loss == 'mse':
            self.criterion = MSELoss()
        elif loss == 'cross_entropy':
            self.criterion = CrossEntropyLoss()
        elif loss == 'nll':
            self.criterion = NLLLoss()
        else:
            raise ValueError(f"Unsupported loss function: {loss}")

        if optimizer == 'sgd':
            self.optimizer = SGD(self.network.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
        elif optimizer == 'adam':
            self.optimizer = Adam(self.network.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            raise ValueError(f"Unsupported optimizer: {optimizer}")

        if metrics:
            self.metrics = {metric: [] for metric in metrics}

    def set_scheduler(self, scheduler_type='step', **kwargs):
        """Attach a learning-rate scheduler built around ``self.optimizer``.

        Args:
            scheduler_type: ``'step'``, ``'reduce_on_plateau'`` or ``'warmup'``.
            **kwargs: Forwarded unchanged to the scheduler constructor.

        Raises:
            ValueError: If ``scheduler_type`` is not supported.
        """
        if scheduler_type == 'step':
            self.scheduler = StepLR(self.optimizer, **kwargs)
        elif scheduler_type == 'reduce_on_plateau':
            self.scheduler = ReduceLROnPlateau(self.optimizer, **kwargs)
        elif scheduler_type == 'warmup':
            self.scheduler = WarmupScheduler(self.optimizer, **kwargs)
        else:
            raise ValueError(f"Unsupported scheduler: {scheduler_type}")

    def set_early_stopping(self, patience=7, min_delta=0, mode='min'):
        """Enable early stopping; :meth:`fit` feeds it the validation loss."""
        self.early_stopping = EarlyStopping(patience=patience, min_delta=min_delta, mode=mode)

    def train_step(self, inputs, targets):
        """Run one optimisation step on a single batch.

        Performs the forward pass, evaluates the loss, clears the gradients,
        back-propagates the loss gradient and finally steps the optimizer.

        Returns:
            Tuple ``(outputs, loss)`` from the forward pass.
        """
        outputs = self.network.forward(inputs)
        loss = self.criterion.updateOutput(outputs, targets)
        self.network.zero_grad()
        grad_output = self.criterion.updateGradInput(outputs, targets)
        self.network.backward(inputs, grad_output)

        self.optimizer.step()

        return outputs, loss

    def evaluate(self, inputs, targets):
        """Compute outputs and loss for a batch with the network in eval mode.

        The network is switched back to training mode afterwards,
        unconditionally (whatever mode it was in before the call).

        Returns:
            Tuple ``(outputs, loss)``.
        """
        self.network.eval()
        outputs = self.network.forward(inputs)
        loss = self.criterion.updateOutput(outputs, targets)
        self.network.train()

        return outputs, loss

    @staticmethod
    def _accuracy(outputs, targets):
        """Return the fraction of correct predictions.

        Multi-column outputs are decoded with ``argmax`` over axis 1; targets
        may be one-hot rows (decoded the same way) or a vector/column of class
        labels.  Single-column (or 1-D) outputs are thresholded at 0.5.  Both
        sides are flattened before comparing so that an ``(N, 1)`` prediction
        array is never broadcast against an ``(N,)`` target array.
        """
        outputs = np.asarray(outputs)
        targets = np.asarray(targets)

        if outputs.ndim > 1 and outputs.shape[1] > 1:
            predictions = np.argmax(outputs, axis=1)
            if targets.ndim > 1 and targets.shape[1] > 1:
                true_classes = np.argmax(targets, axis=1)
            else:
                true_classes = targets.reshape(-1)
        else:
            predictions = (outputs > 0.5).astype(int).reshape(-1)
            true_classes = targets.reshape(-1)

        return np.mean(predictions == true_classes)

    def calculate_metrics(self, outputs, targets, prefix=''):
        """Compute every configured metric for one batch.

        Args:
            outputs: Network outputs for the batch.
            targets: Matching targets.
            prefix: String prepended to each key of the returned dict.

        Returns:
            Dict mapping ``prefix + metric_name`` to the metric value.  Metric
            names other than ``'accuracy'``, ``'mse'`` and ``'mae'`` are
            silently ignored.
        """
        results = {}

        for metric_name in self.metrics:
            if metric_name == 'accuracy':
                results[f'{prefix}accuracy'] = self._accuracy(outputs, targets)

            elif metric_name == 'mse':
                mse = np.mean(np.square(outputs - targets))
                results[f'{prefix}mse'] = mse

            elif metric_name == 'mae':
                mae = np.mean(np.abs(outputs - targets))
                results[f'{prefix}mae'] = mae

        return results

    def fit(self, train_inputs, train_targets, val_inputs=None, val_targets=None,
            epochs=10, batch_size=32, verbose=1, callbacks=None):
        """Train the network with mini-batches and optionally validate each epoch.

        Every epoch reshuffles the training data, runs :meth:`train_step` on
        each batch, optionally evaluates on the validation data, steps the
        scheduler and consults early stopping (which needs validation data).

        Args:
            train_inputs: Training inputs, indexable along axis 0.
            train_targets: Training targets aligned with ``train_inputs``.
            val_inputs: Optional validation inputs.
            val_targets: Optional validation targets.
            epochs: Maximum number of epochs.
            batch_size: Number of samples per batch.
            verbose: Truthy to show the progress bar and per-epoch log line.
            callbacks: Accepted for API compatibility; not used.

        Returns:
            The ``history`` dict.  Metric averages are stored under
            ``history['train_metrics'][name]`` / ``history['val_metrics'][name]``
            keyed by the plain metric name.
        """
        n_samples = train_inputs.shape[0]
        n_batches = (n_samples + batch_size - 1) // batch_size

        for epoch in range(epochs):

            indices = np.random.permutation(n_samples)
            train_inputs_shuffled = train_inputs[indices]
            train_targets_shuffled = train_targets[indices]

            train_losses = []
            # Filled lazily so that only metrics that actually produced a value
            # get a history entry.
            train_metric_values = {}

            self.network.train()

            for batch_idx in tqdm(range(n_batches), disable=not verbose):
                start_idx = batch_idx * batch_size
                end_idx = min(start_idx + batch_size, n_samples)

                batch_inputs = train_inputs_shuffled[start_idx:end_idx]
                batch_targets = train_targets_shuffled[start_idx:end_idx]

                outputs, loss = self.train_step(batch_inputs, batch_targets)
                train_losses.append(loss)

                if self.metrics:
                    # Ask for unprefixed keys: the accumulators, the history and
                    # the log/plot labels are all keyed by the plain metric
                    # name (a prefixed key used to raise KeyError here).
                    batch_metrics = self.calculate_metrics(outputs, batch_targets)
                    for metric_name, value in batch_metrics.items():
                        train_metric_values.setdefault(metric_name, []).append(value)

            train_loss = np.mean(train_losses)
            self.history['train_loss'].append(train_loss)

            for metric_name, values in train_metric_values.items():
                self.history['train_metrics'].setdefault(metric_name, []).append(np.mean(values))

            val_loss = None
            if val_inputs is not None and val_targets is not None:
                val_losses = []
                val_metric_values = {}

                self.network.eval()

                n_val_samples = val_inputs.shape[0]
                n_val_batches = (n_val_samples + batch_size - 1) // batch_size

                for batch_idx in range(n_val_batches):
                    start_idx = batch_idx * batch_size
                    end_idx = min(start_idx + batch_size, n_val_samples)

                    batch_inputs = val_inputs[start_idx:end_idx]
                    batch_targets = val_targets[start_idx:end_idx]

                    outputs, loss = self.evaluate(batch_inputs, batch_targets)
                    val_losses.append(loss)

                    if self.metrics:
                        batch_metrics = self.calculate_metrics(outputs, batch_targets)
                        for metric_name, value in batch_metrics.items():
                            val_metric_values.setdefault(metric_name, []).append(value)

                val_loss = np.mean(val_losses)
                self.history['val_loss'].append(val_loss)

                for metric_name, values in val_metric_values.items():
                    self.history['val_metrics'].setdefault(metric_name, []).append(np.mean(values))

            if verbose:
                log_str = f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}"

                for metric_name, values in train_metric_values.items():
                    log_str += f", Train {metric_name}: {np.mean(values):.4f}"

                if val_loss is not None:
                    log_str += f", Val Loss: {val_loss:.4f}"

                    for metric_name, values in val_metric_values.items():
                        log_str += f", Val {metric_name}: {np.mean(values):.4f}"

                print(log_str)

            if self.scheduler:
                if isinstance(self.scheduler, ReduceLROnPlateau):
                    # The plateau scheduler is stepped with a monitored value.
                    # Without validation data fall back to the training loss
                    # instead of calling step() with no metric.
                    monitored = val_loss if val_loss is not None else train_loss
                    self.scheduler.step(monitored)
                else:
                    self.scheduler.step()

            if self.early_stopping and val_loss is not None:
                if self.early_stopping(val_loss, self.network):
                    print(f"Early stopping triggered after epoch {epoch+1}")
                    # NOTE(review): self.optimizer was built in compile() from
                    # the previous network's parameters and is not rebuilt
                    # here; confirm before training further after this point.
                    self.network = self.early_stopping.get_best_model()
                    break

        return self.history

    def predict(self, inputs, batch_size=32):
        """Return the stacked network outputs for ``inputs``, batch by batch.

        The network is put in eval mode and left there.  With zero input rows
        no batch is produced and ``np.vstack`` raises ``ValueError``.
        """
        n_samples = inputs.shape[0]
        n_batches = (n_samples + batch_size - 1) // batch_size

        self.network.eval()

        predictions = []
        for batch_idx in range(n_batches):
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, n_samples)

            batch_inputs = inputs[start_idx:end_idx]
            batch_outputs = self.network.forward(batch_inputs)
            predictions.append(batch_outputs)

        return np.vstack(predictions)

    def save(self, path):
        """Pickle this whole object to ``path``."""
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(path):
        """Unpickle and return the object stored at ``path``.

        SECURITY: unpickling can execute arbitrary code, so only load files
        that come from a trusted source.
        """
        with open(path, 'rb') as f:
            return pickle.load(f)

    def plot_history(self, figsize=(12, 4)):
        """Plot the loss curves (left) and, if metrics are set, metric curves (right)."""
        fig, axes = plt.subplots(1, 2, figsize=figsize)

        axes[0].plot(self.history['train_loss'], label='Train Loss')
        if 'val_loss' in self.history and self.history['val_loss']:
            axes[0].plot(self.history['val_loss'], label='Validation Loss')
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Loss')
        axes[0].set_title('Loss over epochs')
        axes[0].legend()

        if self.metrics:
            for metric_name in self.history['train_metrics']:
                axes[1].plot(self.history['train_metrics'][metric_name], label=f'Train {metric_name}')

            for metric_name in self.history['val_metrics']:
                axes[1].plot(self.history['val_metrics'][metric_name], label=f'Val {metric_name}')

            axes[1].set_xlabel('Epoch')
            axes[1].set_ylabel('Metric Value')
            axes[1].set_title('Metrics over epochs')
            axes[1].legend()

        plt.tight_layout()
        plt.show()

class FCNNRegressionModel(Model):
    """Fully connected regression network.

    The network is a stack of ``Linear -> [BatchNorm1d] -> ReLU -> Dropout``
    blocks, one per entry of ``hidden_sizes``, followed by a final ``Linear``
    layer that maps to ``output_size``.

    Args:
        input_size: Number of input features.
        output_size: Number of output values.
        hidden_sizes: Widths of the hidden layers, in order.  An empty
            sequence yields a single ``Linear(input_size, output_size)``.
        dropout_rate: Rate passed to every ``Dropout`` layer.
        use_batchnorm: Whether to insert ``BatchNorm1d`` after each hidden
            ``Linear`` layer.
    """

    # The default is a tuple rather than a list so that no mutable object is
    # shared between calls.
    def __init__(self, input_size, output_size, hidden_sizes=(64, 32), dropout_rate=0.2, use_batchnorm=True):
        super(FCNNRegressionModel, self).__init__()

        layers = []
        in_features = input_size
        for width in hidden_sizes:
            layers.append(Linear(in_features, width))
            if use_batchnorm:
                layers.append(BatchNorm1d(width))
            layers.append(ReLU())
            layers.append(Dropout(dropout_rate))
            in_features = width

        # With no hidden layers ``in_features`` is still ``input_size``.
        layers.append(Linear(in_features, output_size))

        self.network = Sequential(*layers)

    def set_activation(self, activation_type):
        """Replace every activation module in the network with a new one.

        Modules that are instances of ReLU, LeakyReLU, Sigmoid or Tanh are
        swapped in place inside ``self.network.modules``.

        Args:
            activation_type: ``'relu'``, ``'leaky_relu'``, ``'sigmoid'`` or
                ``'tanh'``.

        Raises:
            ValueError: If ``activation_type`` is not supported.  The name is
                checked up front, so an invalid name is reported even when the
                network contains no activation module.
        """
        factories = {
            'relu': ReLU,
            'leaky_relu': LeakyReLU,
            'sigmoid': Sigmoid,
            'tanh': Tanh,
        }
        if activation_type not in factories:
            raise ValueError(f"Unsupported activation function: {activation_type}")

        make_activation = factories[activation_type]
        replaceable = (ReLU, LeakyReLU, Sigmoid, Tanh)

        for i, module in enumerate(self.network.modules):
            if isinstance(module, replaceable):
                # A fresh instance per position, as modules hold their own state.
                self.network.modules[i] = make_activation()

class CNNClassificationModel(Model):
    """Convolutional classifier: three conv/pool stages and a two-layer head.

    Each stage is ``Conv2d(kernel_size=3, padding=1) -> ReLU ->
    MaxPool2d(kernel_size=2, stride=2)`` with 32, 64 and 128 output channels.
    The head flattens the feature maps and applies
    ``Linear -> ReLU -> Dropout(0.5) -> Linear -> Softmax``.

    The first ``Linear`` layer expects ``128 * 3 * 3`` flattened features;
    presumably this corresponds to 28x28 inputs -- TODO confirm.

    Args:
        input_channels: Number of channels of the input images.
        num_classes: Size of the output layer.
    """

    def __init__(self, input_channels=1, num_classes=10):
        super(CNNClassificationModel, self).__init__()

        stack = []

        # Feature extractor: layers are created in network order.
        in_channels = input_channels
        for out_channels in (32, 64, 128):
            stack.append(Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
            stack.append(ReLU())
            stack.append(MaxPool2d(kernel_size=2, stride=2))
            in_channels = out_channels

        # Classification head.
        stack.append(Flatten())
        stack.append(Linear(128 * 3 * 3, 128))
        stack.append(ReLU())
        stack.append(Dropout(0.5))
        stack.append(Linear(128, num_classes))
        stack.append(Softmax())

        self.network = Sequential(*stack)

class ConvAutoencoder(Model):
    """Convolutional autoencoder made of an ``encoder`` and a ``decoder`` stack.

    NOTE(review): as written, ``self.network`` is an empty ``Sequential`` and
    the decoder stops after its first Linear/BatchNorm1d/ReLU block, so the
    class looks unfinished -- confirm against the rest of the file.
    """

    def __init__(self, input_channels=1, latent_dim=8):
        """Build the encoder, the decoder and the (currently empty) network.

        Args:
            input_channels: Number of channels of the input images.
            latent_dim: Size of the latent vector produced by the encoder.
        """
        super(ConvAutoencoder, self).__init__()


        # Encoder: three stride-2 convolutions (32, 64, 128 channels), then a
        # projection of the flattened features to ``latent_dim``.
        # NOTE(review): BatchNorm1d is placed directly after Conv2d layers;
        # confirm that it accepts convolutional feature maps.
        self.encoder = Sequential(
            Conv2d(input_channels, 32, kernel_size=3, stride=2, padding=1),
            BatchNorm1d(32),
            ReLU(),

            Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            BatchNorm1d(64),
            ReLU(),

            Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
            BatchNorm1d(128),
            ReLU(),

            Flatten(),
            # Expects 128 * 4 * 4 flattened features; presumably 28x28 inputs
            # -- TODO confirm.
            Linear(128 * 4 * 4, latent_dim),
            BatchNorm1d(latent_dim),
            Tanh()
        )

        # Decoder: maps the latent vector back to 128 * 4 * 4 features.  No
        # layer reshapes or upsamples them back to an image here.
        self.decoder = Sequential(
            Linear(latent_dim, 128 * 4 * 4),
            BatchNorm1d(128 * 4 * 4),
            ReLU(),

        )

        # The network used by the inherited Model methods contains no layers.
        self.network = Sequential(
        )

