<a href="https://colab.research.google.com/github/Undasnr/DL-ML/blob/main/Ronny_CNN1_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Creating a one-dimensional convolutional layer class that limits the number of channels to one**

In [2]:
import numpy as np

# A simple AdaGrad optimizer class to update weights and biases
class AdaGrad:
    """
    AdaGrad optimizer.

    Parameters
    ----------
    lr : float
        Learning rate.
    """
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h_w = 1e-4  # Epsilon for numerical stability
        self.h_b = 1e-4

    def update(self, layer):
        """
        Update weights and biases of a layer.

        Parameters
        ----------
        layer : object
            The layer object to be updated.
        """
        self.h_w += layer.dW ** 2
        self.h_b += layer.db ** 2
        layer.W -= self.lr * layer.dW / np.sqrt(self.h_w)
        layer.b -= self.lr * layer.db / np.sqrt(self.h_b)


# A simple Xavier Initializer class
class XavierInitializer:
    """
    Xavier initializer for weights and biases.
    """
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features

    def W(self):
        """
        Initialize weights with Xavier method.
        """
        return np.random.randn(self.out_features, self.in_features) / np.sqrt(self.in_features)

    def b(self):
        """
        Initialize biases to zeros.
        """
        return np.zeros(self.out_features)


class SimpleConv1d:
    """
    A 1D convolutional layer with a single channel.

    Parameters
    ----------
    W_initializer : object
        Instance of a weight initializer class.
    b_initializer : object
        Instance of a bias initializer class.
    optimizer : object
        Instance of an optimizer class.
    filter_size : int
        The size of the convolutional filter.
    """
    def __init__(self, W_initializer, b_initializer, optimizer, filter_size):
        # Initializing filter size, weights, and bias
        self.filter_size = filter_size
        self.W = W_initializer.W(filter_size, 1).flatten()  # Flatten to a 1D array for single channel
        self.b = b_initializer.b(1)
        self.optimizer = optimizer
        self.x = None  # To store the input for backpropagation
        self.dW = None
        self.db = None
        self.out_size = None

    def forward(self, x):
        """
        Forward propagation.

        Parameters
        ----------
        x : numpy.ndarray
            Input array of shape (N_in,).

        Returns
        -------
        numpy.ndarray
            Output array of shape (N_out,).
        """
        # Storing input for backpropagation
        self.x = x

        # Calculating output size
        N_in = x.shape[0]
        self.out_size = N_in - self.filter_size + 1

        # Initializing output array
        a = np.zeros(self.out_size)

        # Performing convolution
        for i in range(self.out_size):
            # The formula is a_i = sum(x_(i+s) * w_s) + b
            a[i] = np.dot(x[i : i + self.filter_size], self.W) + self.b

        return a

    def backward(self, da):
        """
        Backward propagation.

        Parameters
        ----------
        da : numpy.ndarray
            Gradient array passed from the next layer, shape (N_out,).

        Returns
        -------
        numpy.ndarray
            Gradient to pass to the previous layer, shape (N_in,).
        """
        N_in = self.x.shape[0]

        # Calculating gradients for weights and bias
        self.dW = np.zeros(self.filter_size)
        for s in range(self.filter_size):
            # dL/dw_s = sum(dL/da_i * x_(i+s))
            self.dW[s] = np.sum(da * self.x[s : s + self.out_size])

        self.db = np.sum(da)

        # Updating weights and biases
        self.optimizer.update(self)

        # Calculating the gradient to pass to the previous layer
        dx = np.zeros(N_in)
        # Padding the da array with zeros for easier calculation
        da_padded = np.pad(da, (self.filter_size - 1, self.filter_size - 1), 'constant', constant_values=0)

        # The formula is dL/dx_j = sum(dL/da_(j-s) * w_s)
        for j in range(N_in):
            # Note: We need to reverse the weights for the convolution operation
            dx[j] = np.dot(da_padded[j : j + self.filter_size], self.W[::-1])

        return dx

**2. Output size calculation after one-dimensional convolution**

In [4]:
import numpy as np
import math

# Function to calculate the output size after 1D convolution
def calculate_output_size(N_in, P, F, S):
    """
    Calculates the output size of a 1D convolutional layer.

    Parameters
    ----------
    N_in : int
        Input size (number of features).
    P : int
        Number of paddings in one direction.
    F : int
        Filter size.
    S : int
        Stride size.

    Returns
    -------
    int
        Output size (number of features).
    """
    return math.floor((N_in + 2 * P - F) / S) + 1


# A simple AdaGrad optimizer class to update weights and biases
class AdaGrad:
    """
    AdaGrad optimizer.

    Parameters
    ----------
    lr : float
        Learning rate.
    """
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h_w = 1e-4  # Epsilon for numerical stability
        self.h_b = 1e-4

    def update(self, layer):
        """
        Update weights and biases of a layer.

        Parameters
        ----------
        layer : object
            The layer object to be updated.
        """
        self.h_w += layer.dW ** 2
        self.h_b += layer.db ** 2
        layer.W -= self.lr * layer.dW / np.sqrt(self.h_w)
        layer.b -= self.lr * layer.db / np.sqrt(self.h_b)


# A simple Xavier Initializer class
class XavierInitializer:
    """
    Xavier initializer for weights and biases.
    """
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features

    def W(self):
        """
        Initialize weights with Xavier method.
        """
        return np.random.randn(self.out_features, self.in_features) / np.sqrt(self.in_features)

    def b(self):
        """
        Initialize biases to zeros.
        """
        return np.zeros(self.out_features)


class SimpleConv1d:
    """
    A 1D convolutional layer with a single channel.

    Parameters
    ----------
    W_initializer : object
        Instance of a weight initializer class.
    b_initializer : object
        Instance of a bias initializer class.
    optimizer : object
        Instance of an optimizer class.
    filter_size : int
        The size of the convolutional filter.
    """
    def __init__(self, W_initializer, b_initializer, optimizer, filter_size):
        # Initializing filter size, weights, and bias
        self.filter_size = filter_size
        # Corrected initializer calls
        self.W = W_initializer.W(filter_size, 1).flatten()
        self.b = b_initializer.b(1)
        self.optimizer = optimizer
        self.x = None  # To store the input for backpropagation
        self.dW = None
        self.db = None
        self.out_size = None

    def forward(self, x):
        """
        Forward propagation.

        Parameters
        ----------
        x : numpy.ndarray
            Input array of shape (N_in,).

        Returns
        -------
        numpy.ndarray
            Output array of shape (N_out,).
        """
        # Storing input for backpropagation
        self.x = x

        # Calculating output size (assuming P=0, S=1 as per Problem 1 instructions)
        N_in = x.shape[0]
        self.out_size = N_in - self.filter_size + 1

        # Initializing output array
        a = np.zeros(self.out_size)

        # Performing convolution
        for i in range(self.out_size):
            # The formula is a_i = sum(x_(i+s) * w_s) + b
            a[i] = np.dot(x[i : i + self.filter_size], self.W) + self.b

        return a

    def backward(self, da):
        """
        Backward propagation.

        Parameters
        ----------
        da : numpy.ndarray
            Gradient array passed from the next layer, shape (N_out,).

        Returns
        -------
        numpy.ndarray
            Gradient to pass to the previous layer, shape (N_in,).
        """
        N_in = self.x.shape[0]

        # Calculating gradients for weights and bias
        self.dW = np.zeros(self.filter_size)
        for s in range(self.filter_size):
            # dL/dw_s = sum(dL/da_i * x_(i+s))
            self.dW[s] = np.sum(da * self.x[s : s + self.out_size])

        self.db = np.sum(da)

        # Updating weights and biases
        self.optimizer.update(self)

        # Calculating the gradient to pass to the previous layer
        dx = np.zeros(N_in)
        # Padding the da array with zeros for easier calculation
        da_padded = np.pad(da, (self.filter_size - 1, self.filter_size - 1), 'constant', constant_values=0)

        # The formula is dL/dx_j = sum(dL/da_(j-s) * w_s)
        for j in range(N_in):
            # Note: We need to reverse the weights for the convolution operation
            dx[j] = np.dot(da_padded[j : j + self.filter_size], self.W[::-1])

        return dx

3. Experiment of one-dimensional convolutional layer with small array

In [10]:
import numpy as np
import math

# Function to calculate the output size after 1D convolution
def calculate_output_size(N_in, P, F, S):
    """
    Calculates the output size of a 1D convolutional layer.

    Parameters
    ----------
    N_in : int
        Input size (number of features).
    P : int
        Number of paddings in one direction.
    F : int
        Filter size.
    S : int
        Stride size.

    Returns
    -------
    int
        Output size (number of features).
    """
    return math.floor((N_in + 2 * P - F) / S) + 1


# A simple AdaGrad optimizer class to update weights and biases
class AdaGrad:
    """
    AdaGrad optimizer.

    Parameters
    ----------
    lr : float
        Learning rate.
    """
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h_w = 1e-4  # Epsilon for numerical stability
        self.h_b = 1e-4

    def update(self, layer):
        """
        Update weights and biases of a layer.

        Parameters
        ----------
        layer : object
            The layer object to be updated.
        """
        self.h_w += layer.dW ** 2
        self.h_b += layer.db ** 2
        layer.W -= self.lr * layer.dW / np.sqrt(self.h_w)
        layer.b -= self.lr * layer.db / np.sqrt(self.h_b)


# A simple Xavier Initializer class
class XavierInitializer:
    """
    Xavier initializer for weights and biases.
    """
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features

    def W(self):
        """
        Initialize weights with Xavier method.
        """
        return np.random.randn(self.out_features, self.in_features) / np.sqrt(self.in_features)

    def b(self):
        """
        Initialize biases to zeros.
        """
        return np.zeros(self.out_features)


class SimpleConv1d:
    """
    A 1D convolutional layer with a single channel.

    Parameters
    ----------
    W_initializer : object
        Instance of a weight initializer class.
    b_initializer : object
        Instance of a bias initializer class.
    optimizer : object
        Instance of an optimizer class.
    filter_size : int
        The size of the convolutional filter.
    """
    def __init__(self, W_initializer, b_initializer, optimizer, filter_size):
        # Initializing filter size, weights, and bias
        self.filter_size = filter_size
        self.W = W_initializer.W().flatten().astype(np.float64)
        self.b = b_initializer.b().astype(np.float64)
        self.optimizer = optimizer
        self.x = None  # To store the input for backpropagation
        self.dW = None
        self.db = None
        self.out_size = None

    def forward(self, x):
        """
        Forward propagation.

        Parameters
        ----------
        x : numpy.ndarray
            Input array of shape (N_in,).

        Returns
        -------
        numpy.ndarray
            Output array of shape (N_out,).
        """
        # Storing input for backpropagation
        self.x = x.astype(np.float64)

        # Calculating output size (assuming P=0, S=1 as per Problem 1 instructions)
        N_in = x.shape[0]
        self.out_size = N_in - self.filter_size + 1

        # Initializing output array
        a = np.zeros(self.out_size)

        # Performing convolution
        for i in range(self.out_size):
            # The formula is a_i = sum(x_(i+s) * w_s) + b
            a[i] = np.dot(self.x[i : i + self.filter_size], self.W) + self.b

        return a

    def backward(self, da):
        """
        Backward propagation.

        Parameters
        ----------
        da : numpy.ndarray
            Gradient array passed from the next layer, shape (N_out,).

        Returns
        -------
        numpy.ndarray
            Gradient to pass to the previous layer, shape (N_in,).
        """
        N_in = self.x.shape[0]

        # Calculating gradients for weights and bias
        self.dW = np.zeros(self.filter_size)
        for s in range(self.filter_size):
            # dL/dw_s = sum(dL/da_i * x_(i+s))
            self.dW[s] = np.sum(da * self.x[s : s + self.out_size])

        self.db = np.sum(da)

        # Calculating the gradient to pass to the previous layer
        dx = np.zeros(N_in)
        # Padding the da array with zeros for easier calculation
        da_padded = np.pad(da, (self.filter_size - 1, self.filter_size - 1), 'constant', constant_values=0)

        # The formula is dL/dx_j = sum(dL/da_(j-s) * w_s)
        for j in range(N_in):
            # Note: We need to reverse the weights for the convolution operation
            dx[j] = np.dot(da_padded[j : j + self.filter_size], self.W[::-1])

        # Updating weights and biases AFTER calculating dx
        self.optimizer.update(self)

        return dx


if __name__ == '__main__':
    # Test case from Problem 3
    x = np.array([1, 2, 3, 4])
    w = np.array([3, 5, 7])
    b = np.array([1])
    delta_a = np.array([10, 20])

    # Expected values
    expected_a = np.array([35, 50])
    expected_delta_b = np.array([30])
    expected_delta_w = np.array([50, 80, 110])
    expected_delta_x = np.array([30, 110, 170, 140])

    # Creating layer instance and manually set weights/bias for testing
    optimizer = AdaGrad(lr=0.01)
    conv_layer = SimpleConv1d(W_initializer=XavierInitializer(3, 1),
                              b_initializer=XavierInitializer(1, 1),
                              optimizer=optimizer,
                              filter_size=3)

    conv_layer.W = w.astype(np.float64)
    conv_layer.b = b.astype(np.float64)

    # Forward propagation test
    output_a = conv_layer.forward(x)
    assert np.allclose(output_a, expected_a), f"Forward prop failed: Expected {expected_a}, but got {output_a}"
    print("Forward propagation test passed!")

    # Backward propagation test
    output_dx = conv_layer.backward(delta_a)
    assert np.allclose(conv_layer.dW, expected_delta_w), f"Backward prop (dW) failed: Expected {expected_delta_w}, but got {conv_layer.dW}"
    assert np.allclose(conv_layer.db, expected_delta_b), f"Backward prop (db) failed: Expected {expected_delta_b}, but got {conv_layer.db}"
    assert np.allclose(output_dx, expected_delta_x), f"Backward prop (dx) failed: Expected {expected_delta_x}, but got {output_dx}"
    print("Backward propagation test passed!")
    print("All tests for SimpleConv1d passed!")

Forward propagation test passed!
Backward propagation test passed!
All tests for SimpleConv1d passed!


  a[i] = np.dot(self.x[i : i + self.filter_size], self.W) + self.b


**4. Creating a one-dimensional convolutional layer class that does not limit the number of channels**

In [14]:
import numpy as np
import math

# Function to calculate the output size after 1D convolution
def calculate_output_size(N_in, P, F, S):
    """
    Calculates the output size of a 1D convolutional layer.

    Parameters
    ----------
    N_in : int
        Input size (number of features).
    P : int
        Number of paddings in one direction.
    F : int
        Filter size.
    S : int
        Stride size.

    Returns
    -------
    int
        Output size (number of features).
    """
    return math.floor((N_in + 2 * P - F) / S) + 1


# A simple AdaGrad optimizer class to update weights and biases
class AdaGrad:
    """
    AdaGrad optimizer.

    Parameters
    ----------
    lr : float
        Learning rate.
    """
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h_w = 1e-4  # Epsilon for numerical stability
        self.h_b = 1e-4

    def update(self, layer):
        """
        Update weights and biases of a layer.

        Parameters
        ----------
        layer : object
            The layer object to be updated.
        """
        self.h_w += layer.dW ** 2
        self.h_b += layer.db ** 2
        layer.W -= self.lr * layer.dW / np.sqrt(self.h_w)
        layer.b -= self.lr * layer.db / np.sqrt(self.h_b)


# A simple Xavier Initializer class
class XavierInitializer:
    """
    Xavier initializer for weights and biases.
    """
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features

    def W(self):
        """
        Initialize weights with Xavier method.
        """
        return np.random.randn(self.out_features, self.in_features) / np.sqrt(self.in_features)

    def b(self):
        """
        Initialize biases to zeros.
        """
        return np.zeros(self.out_features)


class Conv1d:
    """
    A 1D convolutional layer that supports multiple channels.

    Parameters
    ----------
    W_initializer : object
        Instance of a weight initializer class.
    b_initializer : object
        Instance of a bias initializer class.
    optimizer : object
        Instance of an optimizer class.
    filter_size : int
        The size of the convolutional filter.
    in_channels : int
        The number of input channels.
    out_channels : int
        The number of output channels.
    """
    def __init__(self, W_initializer, b_initializer, optimizer, filter_size, in_channels, out_channels):
        self.filter_size = filter_size
        self.in_channels = in_channels
        self.out_channels = out_channels

        # Initializing weights with shape (out_channels, in_channels, filter_size)
        # Using a new initializer instance for the correct dimensions
        self.W_initializer = XavierInitializer(in_features=in_channels * filter_size, out_features=out_channels)
        self.W = self.W_initializer.W().reshape(out_channels, in_channels, filter_size).astype(np.float64)

        # Initializing biases with shape (out_channels,)
        self.b_initializer = XavierInitializer(in_features=in_channels * filter_size, out_features=out_channels)
        self.b = self.b_initializer.b().astype(np.float64)

        self.optimizer = optimizer
        self.x = None  # To store the input for backpropagation
        self.dW = None
        self.db = None
        self.out_size = None

    def forward(self, x):
        """
        Forward propagation.

        Parameters
        ----------
        x : numpy.ndarray
            Input array of shape (in_channels, N_in).

        Returns
        -------
        numpy.ndarray
            Output array of shape (out_channels, N_out).
        """
        self.x = x.astype(np.float64)
        N_in = self.x.shape[1]

        # Calculating output size (P=0, S=1)
        self.out_size = N_in - self.filter_size + 1

        # Initializing output array
        a = np.zeros((self.out_channels, self.out_size))

        # Performing convolution for each output channel
        for oc in range(self.out_channels):
            # Iterating through each input channel and sum the results
            for ic in range(self.in_channels):
                for i in range(self.out_size):
                    a[oc, i] += np.dot(self.x[ic, i : i + self.filter_size], self.W[oc, ic, :])
            a[oc, :] += self.b[oc]

        return a

    def backward(self, da):
        """
        Backward propagation.

        Parameters
        ----------
        da : numpy.ndarray
            Gradient array passed from the next layer, shape (out_channels, N_out).

        Returns
        -------
        numpy.ndarray
            Gradient to pass to the previous layer, shape (in_channels, N_in).
        """
        N_in = self.x.shape[1]

        # Calculating gradients for weights and bias
        self.dW = np.zeros(self.W.shape)
        self.db = np.sum(da, axis=1) # Sum gradients across features for each output channel

        # Calculating dW
        for oc in range(self.out_channels):
            for ic in range(self.in_channels):
                for s in range(self.filter_size):
                    self.dW[oc, ic, s] = np.sum(da[oc, :] * self.x[ic, s:s + self.out_size])

        # Calculating dx
        dx = np.zeros(self.x.shape)
        da_padded = np.pad(da, ((0, 0), (self.filter_size - 1, self.filter_size - 1)), 'constant', constant_values=0)

        for ic in range(self.in_channels):
            for oc in range(self.out_channels):
                for j in range(N_in):
                    dx[ic, j] += np.dot(da_padded[oc, j : j + self.filter_size], self.W[oc, ic, ::-1])

        # Updating weights and biases AFTER calculating dx
        self.optimizer.update(self)

        return dx


if __name__ == '__main__':
    # Test case from Problem 4
    x = np.array(
        [[1, 2, 3, 4], [2, 3, 4, 5]]
    )  # shape (2, 4), (number of input channels, number of features).
    w = np.ones((3, 2, 3))  # (out_channels, in_channels, filter_size).
    b = np.array([1, 2, 3])  # (out_channels,)
    delta_a = np.ones((3, 2))  # (out_channels, N_out)

    # Expected forward propagation output
    expected_a = np.array([[16, 22], [17, 23], [18, 24]])

    # Backpropagation gradients
    expected_dx = np.array([[3., 6., 6., 3.],
                            [3., 6., 6., 3.]])
    expected_dW = np.array([[[3., 5., 7.],
                             [5., 7., 9.]],
                            [[3., 5., 7.],
                             [5., 7., 9.]],
                            [[3., 5., 7.],
                             [5., 7., 9.]]])
    expected_db = np.array([2., 2., 2.])

    # Creating layer instance and manually set weights/bias for testing
    optimizer = AdaGrad(lr=0.01)
    conv_layer = Conv1d(W_initializer=XavierInitializer(in_features=2 * 3, out_features=3),
                        b_initializer=XavierInitializer(in_features=2 * 3, out_features=3),
                        optimizer=optimizer,
                        filter_size=3,
                        in_channels=2,
                        out_channels=3)

    conv_layer.W = w.astype(np.float64)
    conv_layer.b = b.astype(np.float64)

    # Forward propagation test
    output_a = conv_layer.forward(x)
    assert np.allclose(output_a, expected_a), f"Forward prop failed: Expected\n{expected_a}, but got\n{output_a}"
    print("Forward propagation test passed!")

    # Backward propagation test
    output_dx = conv_layer.backward(delta_a)
    assert np.allclose(conv_layer.dW, expected_dW), f"Backward prop (dW) failed: Expected\n{expected_dW}, but got\n{conv_layer.dW}"
    assert np.allclose(conv_layer.db, expected_db), f"Backward prop (db) failed: Expected\n{expected_db}, but got\n{conv_layer.db}"
    assert np.allclose(output_dx, expected_dx), f"Backward prop (dx) failed: Expected\n{expected_dx}, but got\n{output_dx}"
    print("Backward propagation test passed!")
    print("All tests for Conv1d passed!")

Forward propagation test passed!
Backward propagation test passed!
All tests for Conv1d passed!


**5. Implementing padding**

In [17]:
import numpy as np
import math

# Function to calculate the output size after 1D convolution
def calculate_output_size(N_in, P, F, S):
    """
    Calculates the output size of a 1D convolutional layer.

    Parameters
    ----------
    N_in : int
        Input size (number of features).
    P : int
        Number of paddings in one direction.
    F : int
        Filter size.
    S : int
        Stride size.

    Returns
    -------
    int
        Output size (number of features).
    """
    return math.floor((N_in + 2 * P - F) / S) + 1


# A simple AdaGrad optimizer class to update weights and biases
class AdaGrad:
    """
    AdaGrad optimizer.

    Parameters
    ----------
    lr : float
        Learning rate.
    """
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h_w = 1e-4  # Epsilon for numerical stability
        self.h_b = 1e-4

    def update(self, layer):
        """
        Update weights and biases of a layer.

        Parameters
        ----------
        layer : object
            The layer object to be updated.
        """
        self.h_w += layer.dW ** 2
        self.h_b += layer.db ** 2
        layer.W -= self.lr * layer.dW / np.sqrt(self.h_w)
        layer.b -= self.lr * layer.db / np.sqrt(self.h_b)


# A simple Xavier Initializer class
class XavierInitializer:
    """
    Xavier initializer for weights and biases.
    """
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features

    def W(self):
        """
        Initialize weights with Xavier method.
        """
        return np.random.randn(self.out_features, self.in_features) / np.sqrt(self.in_features)

    def b(self):
        """
        Initialize biases to zeros.
        """
        return np.zeros(self.out_features)


class Conv1d:
    """
    A 1D convolutional layer that supports multiple channels and padding.

    Parameters
    ----------
    W_initializer : object
        Instance of a weight initializer class.
    b_initializer : object
        Instance of a bias initializer class.
    optimizer : object
        Instance of an optimizer class.
    filter_size : int
        The size of the convolutional filter.
    in_channels : int
        The number of input channels.
    out_channels : int
        The number of output channels.
    padding : int
        The amount of zero-padding to add to the input. Default is 0.
    """
    def __init__(self, W_initializer, b_initializer, optimizer, filter_size, in_channels, out_channels, padding=0):
        self.filter_size = filter_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.padding = padding

        # Initializing weights with shape (out_channels, in_channels, filter_size)
        self.W_initializer = XavierInitializer(in_features=in_channels * filter_size, out_features=out_channels)
        self.W = self.W_initializer.W().reshape(out_channels, in_channels, filter_size).astype(np.float64)

        # Initializing biases with shape (out_channels,)
        self.b_initializer = XavierInitializer(in_features=in_channels * filter_size, out_features=out_channels)
        self.b = self.b_initializer.b().astype(np.float64)

        self.optimizer = optimizer
        self.x = None  # To store the input for backpropagation
        self.dW = None
        self.db = None
        self.out_size = None

    def forward(self, x):
        """
        Forward propagation.

        Parameters
        ----------
        x : numpy.ndarray
            Input array of shape (in_channels, N_in).

        Returns
        -------
        numpy.ndarray
            Output array of shape (out_channels, N_out).
        """
        self.x = x.astype(np.float64)
        N_in = self.x.shape[1]

        # Padding the input array
        x_padded = np.pad(self.x, ((0, 0), (self.padding, self.padding)), 'constant', constant_values=0)

        # Calculating output size
        self.out_size = calculate_output_size(N_in=N_in, P=self.padding, F=self.filter_size, S=1)

        # Initializing output array
        a = np.zeros((self.out_channels, self.out_size))

        # Performing convolution for each output channel
        for oc in range(self.out_channels):
            for ic in range(self.in_channels):
                for i in range(self.out_size):
                    a[oc, i] += np.dot(x_padded[ic, i : i + self.filter_size], self.W[oc, ic, :])
            a[oc, :] += self.b[oc]

        return a

    def backward(self, da):
        """
        Backward propagation.

        Parameters
        ----------
        da : numpy.ndarray
            Gradient array passed from the next layer, shape (out_channels, N_out).

        Returns
        -------
        numpy.ndarray
            Gradient to pass to the previous layer, shape (in_channels, N_in).
        """
        N_in = self.x.shape[1]

        # Calculating gradients for weights and bias
        self.dW = np.zeros(self.W.shape)
        self.db = np.sum(da, axis=1) # Sum gradients across features for each output channel

        # Calculating dW
        x_padded = np.pad(self.x, ((0, 0), (self.padding, self.padding)), 'constant', constant_values=0)
        for oc in range(self.out_channels):
            for ic in range(self.in_channels):
                for s in range(self.filter_size):
                    self.dW[oc, ic, s] = np.sum(da[oc, :] * x_padded[ic, s:s + self.out_size])

        # Calculating dx
        dx_padded = np.zeros(x_padded.shape)
        da_padded = np.pad(da, ((0, 0), (self.filter_size - 1, self.filter_size - 1)), 'constant', constant_values=0)

        for ic in range(self.in_channels):
            for oc in range(self.out_channels):
                for j in range(x_padded.shape[1]):
                    dx_padded[ic, j] += np.dot(da_padded[oc, j : j + self.filter_size], self.W[oc, ic, ::-1])

        # Removing padding from dx
        dx = dx_padded[:, self.padding : N_in + self.padding]

        # Updating weights and biases AFTER calculating dx
        self.optimizer.update(self)

        return dx


if __name__ == '__main__':
    # Test case with padding
    x_test = np.array([[1, 2, 3], [4, 5, 6]]) # 2 input channels, 3 features
    w_test = np.array([[[1, 1], [1, 1]], [[2, 2], [2, 2]]]) # 2 output channels, 2 input channels, 2 filter size
    b_test = np.array([1, 2])
    padding_val = 1

    # Corrected expected values for padding test case
    expected_a_padded = np.array([[6, 13, 17, 10], [12, 26, 34, 20]])
    expected_db_padded = np.array([4., 4.])
    expected_dW_padded = np.array([[[6., 6.], [15., 15.]], [[6., 6.], [15., 15.]]])
    expected_dx = np.array([[6., 6., 6.], [6., 6., 6.]])

    # Creating layer instance and manually set weights/bias for testing
    optimizer = AdaGrad(lr=0.01)
    conv_layer = Conv1d(W_initializer=XavierInitializer(in_features=2 * 2, out_features=2),
                        b_initializer=XavierInitializer(in_features=2 * 2, out_features=2),
                        optimizer=optimizer,
                        filter_size=2,
                        in_channels=2,
                        out_channels=2,
                        padding=padding_val)

    conv_layer.W = w_test.astype(np.float64)
    conv_layer.b = b_test.astype(np.float64)

    # Forward propagation test with padding
    output_a = conv_layer.forward(x_test)
    assert np.allclose(output_a, expected_a_padded), f"Forward prop with padding failed: Expected\n{expected_a_padded}, but got\n{output_a}"
    print("Forward propagation with padding test passed!")

    # Backward propagation test with padding
    da_padded = np.ones((2, 4))
    output_dx = conv_layer.backward(da_padded)

    assert np.allclose(conv_layer.dW, expected_dW_padded), f"Backward prop (dW) with padding failed: Expected\n{expected_dW_padded}, but got\n{conv_layer.dW}"
    assert np.allclose(conv_layer.db, expected_db_padded), f"Backward prop (db) with padding failed: Expected\n{expected_db_padded}, but got\n{conv_layer.db}"
    assert np.allclose(output_dx, expected_dx), f"Backward prop (dx) with padding failed: Expected\n{expected_dx}, but got\n{output_dx}"
    print("Backward propagation with padding test passed!")
    print("All tests for Conv1d with padding passed!")

    # Test case from Problem 4 (no padding)
    x = np.array(
        [[1, 2, 3, 4], [2, 3, 4, 5]]
    )
    w = np.ones((3, 2, 3))
    b = np.array([1, 2, 3])
    delta_a = np.ones((3, 2))

    # Expected values
    expected_a = np.array([[16, 22], [17, 23], [18, 24]])
    expected_dx = np.array([[3., 6., 6., 3.],
                            [3., 6., 6., 3.]])
    expected_dW = np.array([[[3., 5., 7.],
                             [5., 7., 9.]],
                            [[3., 5., 7.],
                             [5., 7., 9.]],
                            [[3., 5., 7.],
                             [5., 7., 9.]]])
    expected_db = np.array([2., 2., 2.])

    conv_layer_nopad = Conv1d(W_initializer=XavierInitializer(in_features=2 * 3, out_features=3),
                        b_initializer=XavierInitializer(in_features=2 * 3, out_features=3),
                        optimizer=AdaGrad(lr=0.01),
                        filter_size=3,
                        in_channels=2,
                        out_channels=3,
                        padding=0)

    conv_layer_nopad.W = w.astype(np.float64)
    conv_layer_nopad.b = b.astype(np.float64)

    output_a = conv_layer_nopad.forward(x)
    assert np.allclose(output_a, expected_a), f"Forward prop failed: Expected\n{expected_a}, but got\n{output_a}"
    print("Forward propagation test (no padding) passed!")

    output_dx = conv_layer_nopad.backward(delta_a)
    assert np.allclose(conv_layer_nopad.dW, expected_dW), f"Backward prop (dW) failed: Expected\n{expected_dW}, but got\n{conv_layer_nopad.dW}"
    assert np.allclose(conv_layer_nopad.db, expected_db), f"Backward prop (db) failed: Expected\n{expected_db}, but got\n{conv_layer_nopad.db}"
    assert np.allclose(output_dx, expected_dx), f"Backward prop (dx) failed: Expected\n{expected_dx}, but got\n{output_dx}"
    print("Backward propagation test (no padding) passed!")
    print("All tests passed!")

Forward propagation with padding test passed!
Backward propagation with padding test passed!
All tests for Conv1d with padding passed!
Forward propagation test (no padding) passed!
Backward propagation test (no padding) passed!
All tests passed!


**6. Response to mini batch**

In [21]:
import numpy as np
import math

# Function to calculate the output size after 1D convolution
def calculate_output_size(N_in, P, F, S):
    """
    Calculates the output size of a 1D convolutional layer.

    Parameters
    ----------
    N_in : int
        Input size (number of features).
    P : int
        Number of paddings in one direction.
    F : int
        Filter size.
    S : int
        Stride size.

    Returns
    -------
    int
        Output size (number of features).
    """
    return math.floor((N_in + 2 * P - F) / S) + 1


# A simple AdaGrad optimizer class to update weights and biases
class AdaGrad:
    """
    AdaGrad optimizer.

    Parameters
    ----------
    lr : float
        Learning rate.
    """
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h_w = 1e-4  # Epsilon for numerical stability
        self.h_b = 1e-4

    def update(self, layer):
        """
        Update weights and biases of a layer.

        Parameters
        ----------
        layer : object
            The layer object to be updated.
        """
        self.h_w += layer.dW ** 2
        self.h_b += layer.db ** 2
        layer.W -= self.lr * layer.dW / np.sqrt(self.h_w)
        layer.b -= self.lr * layer.db / np.sqrt(self.h_b)


# A simple Xavier Initializer class
class XavierInitializer:
    """
    Xavier initializer for weights and biases.
    """
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features

    def W(self):
        """
        Initialize weights with Xavier method.
        """
        return np.random.randn(self.out_features, self.in_features) / np.sqrt(self.in_features)

    def b(self):
        """
        Initialize biases to zeros.
        """
        return np.zeros(self.out_features)


class Conv1d:
    """
    A 1D convolutional layer that supports multiple channels, padding, and mini-batches.

    Parameters
    ----------
    W_initializer : object
        Instance of a weight initializer class.
    b_initializer : object
        Instance of a bias initializer class.
    optimizer : object
        Instance of an optimizer class.
    filter_size : int
        The size of the convolutional filter.
    in_channels : int
        The number of input channels.
    out_channels : int
        The number of output channels.
    padding : int
        The amount of zero-padding to add to the input. Default is 0.
    """
    def __init__(self, W_initializer, b_initializer, optimizer, filter_size, in_channels, out_channels, padding=0):
        self.filter_size = filter_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.padding = padding

        # Initializing weights with shape (out_channels, in_channels, filter_size)
        self.W_initializer = XavierInitializer(in_features=in_channels * filter_size, out_features=out_channels)
        self.W = self.W_initializer.W().reshape(out_channels, in_channels, filter_size).astype(np.float64)

        # Initializing biases with shape (out_channels,)
        self.b_initializer = XavierInitializer(in_features=in_channels * filter_size, out_features=out_channels)
        self.b = self.b_initializer.b().astype(np.float64)

        self.optimizer = optimizer
        self.x = None  # To store the input for backpropagation
        self.dW = None
        self.db = None
        self.out_size = None

    def forward(self, x):
        """
        Forward propagation.

        Parameters
        ----------
        x : numpy.ndarray
            Input array of shape (batch_size, in_channels, N_in).

        Returns
        -------
        numpy.ndarray
            Output array of shape (batch_size, out_channels, N_out).
        """
        self.x = x.astype(np.float64)
        batch_size, in_channels, N_in = self.x.shape

        # Padding the input array
        x_padded = np.pad(self.x, ((0, 0), (0, 0), (self.padding, self.padding)), 'constant', constant_values=0)

        # Calculating output size
        self.out_size = calculate_output_size(N_in=N_in, P=self.padding, F=self.filter_size, S=1)

        # Initializing output array
        a = np.zeros((batch_size, self.out_channels, self.out_size))

        # Performing convolution for each output channel and each sample in the batch
        for b in range(batch_size):
            for oc in range(self.out_channels):
                for ic in range(self.in_channels):
                    for i in range(self.out_size):
                        a[b, oc, i] += np.dot(x_padded[b, ic, i : i + self.filter_size], self.W[oc, ic, :])
                a[b, oc, :] += self.b[oc]

        return a

    def backward(self, da):
        """
        Backward propagation.

        Parameters
        ----------
        da : numpy.ndarray
            Gradient array passed from the next layer, shape (batch_size, out_channels, N_out).

        Returns
        -------
        numpy.ndarray
            Gradient to pass to the previous layer, shape (batch_size, in_channels, N_in).
        """
        batch_size, in_channels, N_in = self.x.shape

        # Initializing gradients for weights and bias, summing across the batch
        self.dW = np.zeros(self.W.shape)
        self.db = np.sum(da, axis=(0, 2)) # Sum gradients across batch and features

        # Calculating dW
        x_padded = np.pad(self.x, ((0, 0), (0, 0), (self.padding, self.padding)), 'constant', constant_values=0)
        for b in range(batch_size):
            for oc in range(self.out_channels):
                for ic in range(self.in_channels):
                    for s in range(self.filter_size):
                        self.dW[oc, ic, s] += np.sum(da[b, oc, :] * x_padded[b, ic, s:s + self.out_size])

        # Calculating dx
        dx_padded = np.zeros(x_padded.shape)
        da_padded = np.pad(da, ((0, 0), (0, 0), (self.filter_size - 1, self.filter_size - 1)), 'constant', constant_values=0)

        for b in range(batch_size):
            for ic in range(self.in_channels):
                for oc in range(self.out_channels):
                    for j in range(x_padded.shape[2]):
                        dx_padded[b, ic, j] += np.dot(da_padded[b, oc, j : j + self.filter_size], self.W[oc, ic, ::-1])

        # Removing padding from dx
        dx = dx_padded[:, :, self.padding : N_in + self.padding]

        # Updating weights and biases AFTER calculating dx
        self.optimizer.update(self)

        return dx


if __name__ == '__main__':
    # Test case with mini-batch and padding
    x_test_batch = np.array([[[1, 2, 3], [4, 5, 6]], [[-1, -2, -3], [-4, -5, -6]]]) # (batch_size, in_channels, N_in)
    w_test = np.array([[[1, 1], [1, 1]], [[2, 2], [2, 2]]]) # 2 out_channels, 2 in_channels, 2 filter_size
    b_test = np.array([1, 2])
    padding_val = 1

    expected_a_padded_batch = np.array([[[ 6., 13., 17., 10.],
                                         [12., 26., 34., 20.]],
                                        [[ -4., -11., -15., -8.],
                                         [-8., -22., -30., -16.]]])
    expected_db_padded_batch = np.array([8., 8.])
    expected_dW_padded_batch = np.array([[[0., 0.], [0., 0.]], [[0., 0.], [0., 0.]]])
    # backward dx is independent of input x values' sign.
    expected_dx_batch = np.array([[[6., 6., 6.], [6., 6., 6.]], [[6., 6., 6.], [6., 6., 6.]]])

    # Create layer instance and manually set weights/bias for testing
    optimizer = AdaGrad(lr=0.01)
    conv_layer = Conv1d(W_initializer=XavierInitializer(in_features=2 * 2, out_features=2),
                        b_initializer=XavierInitializer(in_features=2 * 2, out_features=2),
                        optimizer=optimizer,
                        filter_size=2,
                        in_channels=2,
                        out_channels=2,
                        padding=padding_val)

    conv_layer.W = w_test.astype(np.float64)
    conv_layer.b = b_test.astype(np.float64)

    # Forward propagation test with mini-batch
    output_a = conv_layer.forward(x_test_batch)
    assert np.allclose(output_a, expected_a_padded_batch), f"Forward prop with mini-batch failed: Expected\n{expected_a_padded_batch}, but got\n{output_a}"
    print("Forward propagation with mini-batch test passed!")

    # Backward propagation test with mini-batch
    da_padded_batch = np.ones((2, 2, 4))
    output_dx_batch = conv_layer.backward(da_padded_batch)

    assert np.allclose(conv_layer.dW, expected_dW_padded_batch), f"Backward prop (dW) with mini-batch failed: Expected\n{expected_dW_padded_batch}, but got\n{conv_layer.dW}"
    assert np.allclose(conv_layer.db, expected_db_padded_batch), f"Backward prop (db) with mini-batch failed: Expected\n{expected_db_padded_batch}, but got\n{conv_layer.db}"
    assert np.allclose(output_dx_batch, expected_dx_batch), f"Backward prop (dx) with mini-batch failed: Expected\n{expected_dx_batch}, but got\n{output_dx_batch}"
    print("Backward propagation with mini-batch test passed!")
    print("All tests for Conv1d with mini-batch and padding passed!")

    # Test case from Problem 4 (no padding and batch size of 1) to ensure backward compatibility
    x = np.array([[[1, 2, 3, 4], [2, 3, 4, 5]]]) # added batch dimension
    w = np.ones((3, 2, 3))
    b = np.array([1, 2, 3])
    delta_a = np.ones((1, 3, 2)) # added batch dimension

    # Expected values
    expected_a = np.array([[[16, 22], [17, 23], [18, 24]]])
    expected_dx = np.array([[[3., 6., 6., 3.],
                            [3., 6., 6., 3.]]])
    expected_dW = np.array([[[3., 5., 7.],
                             [5., 7., 9.]],
                            [[3., 5., 7.],
                             [5., 7., 9.]],
                            [[3., 5., 7.],
                             [5., 7., 9.]]])
    expected_db = np.array([2., 2., 2.])

    conv_layer_nopad = Conv1d(W_initializer=XavierInitializer(in_features=2 * 3, out_features=3),
                        b_initializer=XavierInitializer(in_features=2 * 3, out_features=3),
                        optimizer=AdaGrad(lr=0.01),
                        filter_size=3,
                        in_channels=2,
                        out_channels=3,
                        padding=0)

    conv_layer_nopad.W = w.astype(np.float64)
    conv_layer_nopad.b = b.astype(np.float64)

    output_a = conv_layer_nopad.forward(x)
    assert np.allclose(output_a, expected_a), f"Forward prop failed: Expected\n{expected_a}, but got\n{output_a}"
    print("Forward propagation test (batch size 1) passed!")

    output_dx = conv_layer_nopad.backward(delta_a)
    assert np.allclose(conv_layer_nopad.dW, expected_dW), f"Backward prop (dW) failed: Expected\n{expected_dW}, but got\n{conv_layer_nopad.dW}"
    assert np.allclose(conv_layer_nopad.db, expected_db), f"Backward prop (db) failed: Expected\n{expected_db}, but got\n{conv_layer_nopad.db}"
    assert np.allclose(output_dx, expected_dx), f"Backward prop (dx) failed: Expected\n{expected_dx}, but got\n{output_dx}"
    print("Backward propagation test (batch size 1) passed!")
    print("All tests passed!")

Forward propagation with mini-batch test passed!
Backward propagation with mini-batch test passed!
All tests for Conv1d with mini-batch and padding passed!
Forward propagation test (batch size 1) passed!
Backward propagation test (batch size 1) passed!
All tests passed!


**7. Arbitrary number of strides**

In [24]:
import numpy as np
import math

# Function to calculate the output size after 1D convolution
def calculate_output_size(N_in, P, F, S):
    """
    Calculates the output size of a 1D convolutional layer.

    Parameters
    ----------
    N_in : int
        Input size (number of features).
    P : int
        Number of paddings in one direction.
    F : int
        Filter size.
    S : int
        Stride size.

    Returns
    -------
    int
        Output size (number of features).
    """
    return math.floor((N_in + 2 * P - F) / S) + 1


# A simple AdaGrad optimizer class to update weights and biases
class AdaGrad:
    """
    AdaGrad optimizer.

    Parameters
    ----------
    lr : float
        Learning rate.
    """
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h_w = 1e-4  # Epsilon for numerical stability
        self.h_b = 1e-4

    def update(self, layer):
        """
        Update weights and biases of a layer.

        Parameters
        ----------
        layer : object
            The layer object to be updated.
        """
        self.h_w += layer.dW ** 2
        self.h_b += layer.db ** 2
        layer.W -= self.lr * layer.dW / np.sqrt(self.h_w)
        layer.b -= self.lr * layer.db / np.sqrt(self.h_b)


# A simple Xavier Initializer class
class XavierInitializer:
    """
    Xavier initializer for weights and biases.
    """
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features

    def W(self):
        """
        Initialize weights with Xavier method.
        """
        return np.random.randn(self.out_features, self.in_features) / np.sqrt(self.in_features)

    def b(self):
        """
        Initialize biases to zeros.
        """
        return np.zeros(self.out_features)


class Conv1d:
    """
    A 1D convolutional layer that supports multiple channels, padding, and mini-batches.

    Parameters
    ----------
    W_initializer : object
        Instance of a weight initializer class.
    b_initializer : object
        Instance of a bias initializer class.
    optimizer : object
        Instance of an optimizer class.
    filter_size : int
        The size of the convolutional filter.
    in_channels : int
        The number of input channels.
    out_channels : int
        The number of output channels.
    padding : int
        The amount of zero-padding to add to the input. Default is 0.
    stride : int
        The step size of the convolution. Default is 1.
    """
    def __init__(self, W_initializer, b_initializer, optimizer, filter_size, in_channels, out_channels, padding=0, stride=1):
        self.filter_size = filter_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.padding = padding
        self.stride = stride

        # Initializing weights with shape (out_channels, in_channels, filter_size)
        self.W_initializer = XavierInitializer(in_features=in_channels * filter_size, out_features=out_channels)
        self.W = self.W_initializer.W().reshape(out_channels, in_channels, filter_size).astype(np.float64)

        # Initializing biases with shape (out_channels,)
        self.b_initializer = XavierInitializer(in_features=in_channels * filter_size, out_features=out_channels)
        self.b = self.b_initializer.b().astype(np.float64)

        self.optimizer = optimizer
        self.x = None  # To store the input for backpropagation
        self.dW = None
        self.db = None
        self.out_size = None

    def forward(self, x):
        """
        Forward propagation.

        Parameters
        ----------
        x : numpy.ndarray
            Input array of shape (batch_size, in_channels, N_in).

        Returns
        -------
        numpy.ndarray
            Output array of shape (batch_size, out_channels, N_out).
        """
        self.x = x.astype(np.float64)
        batch_size, in_channels, N_in = self.x.shape

        # Padding the input array
        x_padded = np.pad(self.x, ((0, 0), (0, 0), (self.padding, self.padding)), 'constant', constant_values=0)

        # Calculating output size
        self.out_size = calculate_output_size(N_in=N_in, P=self.padding, F=self.filter_size, S=self.stride)

        # Initializing output array
        a = np.zeros((batch_size, self.out_channels, self.out_size))

        # Performing convolution for each output channel and each sample in the batch
        for b in range(batch_size):
            for oc in range(self.out_channels):
                for ic in range(self.in_channels):
                    for i in range(self.out_size):
                        # Using the stride to slice the padded input
                        start = i * self.stride
                        end = start + self.filter_size
                        a[b, oc, i] += np.dot(x_padded[b, ic, start : end], self.W[oc, ic, :])
                a[b, oc, :] += self.b[oc]

        return a

    def backward(self, da):
        """
        Backward propagation.

        Parameters
        ----------
        da : numpy.ndarray
            Gradient array passed from the next layer, shape (batch_size, out_channels, N_out).

        Returns
        -------
        numpy.ndarray
            Gradient to pass to the previous layer, shape (batch_size, in_channels, N_in).
        """
        batch_size, in_channels, N_in = self.x.shape

        # Initializing gradients for weights and bias, summing across the batch
        self.dW = np.zeros(self.W.shape)
        self.db = np.sum(da, axis=(0, 2)) # Sum gradients across batch and features

        # Calculating dW
        x_padded = np.pad(self.x, ((0, 0), (0, 0), (self.padding, self.padding)), 'constant', constant_values=0)
        for b in range(batch_size):
            for oc in range(self.out_channels):
                for ic in range(self.in_channels):
                    for i in range(self.out_size):
                        start = i * self.stride
                        end = start + self.filter_size
                        self.dW[oc, ic, :] += da[b, oc, i] * x_padded[b, ic, start : end]

        dx_padded = np.zeros(x_padded.shape)
        for b in range(batch_size):
            for oc in range(self.out_channels):
                for ic in range(self.in_channels):
                    for i in range(self.out_size):
                        start = i * self.stride
                        end = start + self.filter_size
                        # The gradient is scattered back to the input space at strided intervals
                        # and multiplied by the flipped weights.
                        dx_padded[b, ic, start:end] += da[b, oc, i] * self.W[oc, ic, ::-1]

        # Removing padding from dx
        dx = dx_padded[:, :, self.padding : N_in + self.padding]

        # Updating weights and biases AFTER calculating dx
        self.optimizer.update(self)

        return dx


if __name__ == '__main__':
    # Test case with mini-batch and padding
    x_test_batch = np.array([[[1, 2, 3], [4, 5, 6]], [[-1, -2, -3], [-4, -5, -6]]]) # (batch_size, in_channels, N_in)
    w_test = np.array([[[1, 1], [1, 1]], [[2, 2], [2, 2]]]) # 2 out_channels, 2 in_channels, 2 filter_size
    b_test = np.array([1, 2])
    padding_val = 1
    stride_val = 1

    expected_a_padded_batch = np.array([[[ 6., 13., 17., 10.],
                                         [12., 26., 34., 20.]],
                                        [[ -4., -11., -15., -8.],
                                         [-8., -22., -30., -16.]]])
    expected_db_padded_batch = np.array([8., 8.])
    expected_dW_padded_batch = np.array([[[0., 0.], [0., 0.]], [[0., 0.], [0., 0.]]])
    expected_dx_batch = np.array([[[6., 6., 6.], [6., 6., 6.]], [[6., 6., 6.], [6., 6., 6.]]])

    # Creating layer instance and manually set weights/bias for testing
    optimizer = AdaGrad(lr=0.01)
    conv_layer = Conv1d(W_initializer=XavierInitializer(in_features=2 * 2, out_features=2),
                        b_initializer=XavierInitializer(in_features=2 * 2, out_features=2),
                        optimizer=optimizer,
                        filter_size=2,
                        in_channels=2,
                        out_channels=2,
                        padding=padding_val,
                        stride=stride_val)

    conv_layer.W = w_test.astype(np.float64)
    conv_layer.b = b_test.astype(np.float64)

    # Forward propagation test with mini-batch
    output_a = conv_layer.forward(x_test_batch)
    assert np.allclose(output_a, expected_a_padded_batch), f"Forward prop with mini-batch failed: Expected\n{expected_a_padded_batch}, but got\n{output_a}"
    print("Forward propagation with mini-batch test passed!")

    # Backward propagation test with mini-batch
    da_padded_batch = np.ones((2, 2, 4))
    output_dx_batch = conv_layer.backward(da_padded_batch)

    assert np.allclose(conv_layer.dW, expected_dW_padded_batch), f"Backward prop (dW) with mini-batch failed: Expected\n{expected_dW_padded_batch}, but got\n{conv_layer.dW}"
    assert np.allclose(conv_layer.db, expected_db_padded_batch), f"Backward prop (db) with mini-batch failed: Expected\n{expected_db_padded_batch}, but got\n{conv_layer.db}"
    assert np.allclose(output_dx_batch, expected_dx_batch), f"Backward prop (dx) with mini-batch failed: Expected\n{expected_dx_batch}, but got\n{output_dx_batch}"
    print("Backward propagation with mini-batch test passed!")
    print("All tests for Conv1d with mini-batch and padding passed!")

    # New Test case for arbitrary stride (S=2)
    x_stride_test = np.array([[[1, 2, 3, 4], [2, 3, 4, 5]]]) # added batch dimension
    w_stride_test = np.ones((1, 2, 2))
    b_stride_test = np.array([1])
    da_stride_test = np.ones((1, 1, 2))

    # Expected values for stride=2
    expected_a_stride = np.array([[[9., 17.]]])
    expected_dW_stride = np.array([[[4., 6.], [6., 8.]]])
    expected_db_stride = np.array([2.])
    expected_dx_stride = np.array([[[1., 1., 1., 1.], [1., 1., 1., 1.]]])

    conv_layer_stride = Conv1d(W_initializer=XavierInitializer(in_features=2*2, out_features=1),
                            b_initializer=XavierInitializer(in_features=2*2, out_features=1),
                            optimizer=AdaGrad(lr=0.01),
                            filter_size=2,
                            in_channels=2,
                            out_channels=1,
                            padding=0,
                            stride=2)

    conv_layer_stride.W = w_stride_test.astype(np.float64)
    conv_layer_stride.b = b_stride_test.astype(np.float64)

    output_a_stride = conv_layer_stride.forward(x_stride_test)
    assert np.allclose(output_a_stride, expected_a_stride), f"Forward prop with stride failed: Expected\n{expected_a_stride}, but got\n{output_a_stride}"
    print("Forward propagation with stride test passed!")

    output_dx_stride = conv_layer_stride.backward(da_stride_test)
    assert np.allclose(conv_layer_stride.dW, expected_dW_stride), f"Backward prop (dW) with stride failed: Expected\n{expected_dW_stride}, but got\n{conv_layer_stride.dW}"
    assert np.allclose(conv_layer_stride.db, expected_db_stride), f"Backward prop (db) with stride failed: Expected\n{expected_db_stride}, but got\n{conv_layer_stride.db}"
    assert np.allclose(output_dx_stride, expected_dx_stride), f"Backward prop (dx) with stride failed: Expected\n{expected_dx_stride}, but got\n{output_dx_stride}"
    print("Backward propagation with stride test passed!")
    print("All tests for Conv1d with stride, mini-batch, and padding passed!")

Forward propagation with mini-batch test passed!
Backward propagation with mini-batch test passed!
All tests for Conv1d with mini-batch and padding passed!
Forward propagation with stride test passed!
Backward propagation with stride test passed!
All tests for Conv1d with stride, mini-batch, and padding passed!


**8. Learning and estimation**

In [29]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

class Conv1d:
    def __init__(self, in_c, out_c, kernel_size, stride=1, pad=0):
        self.in_c, self.out_c, self.k_size, self.stride, self.pad = in_c, out_c, kernel_size, stride, pad
        scale = np.sqrt(2.0 / (in_c * kernel_size))
        self.W = np.random.randn(out_c, in_c, kernel_size) * scale
        self.b = np.zeros(out_c)
        self.cache = None

    def forward(self, x):
        batch_size, _, n_features = x.shape
        padded_x = np.pad(x, ((0,0),(0,0),(self.pad,self.pad)), 'constant')
        out_len = (n_features + 2*self.pad - self.k_size) // self.stride + 1
        output = np.zeros((batch_size, self.out_c, out_len))

        windows = []
        for i in range(0, out_len):
            start = i * self.stride
            window = padded_x[:, :, start:start+self.k_size]
            windows.append((i, start, window))

        for i, start, window in windows:
            for k in range(self.out_c):
                output[:, k, i] = np.sum(window * self.W[k], axis=(1,2)) + self.b[k]

        self.cache = (x, padded_x, windows)
        return output

    def backward(self, d_out):
        x, padded_x, windows = self.cache
        batch_size, _, n_features = x.shape

        dW = np.zeros_like(self.W)
        db = np.sum(d_out, axis=(0,2))
        dx_padded = np.zeros_like(padded_x)

        for i, start, window in windows:
            for k in range(self.out_c):
                dW[k] += np.sum(window * d_out[:, k, i][:, None, None], axis=0)
                dx_padded[:, :, start:start+self.k_size] += d_out[:, k, i][:, None, None] * self.W[k]

        dx = dx_padded[:, :, self.pad:self.pad+n_features] if self.pad > 0 else dx_padded
        self.dW, self.db = dW, db
        return dx

    def update(self, lr):
        self.W -= lr * self.dW
        self.b -= lr * self.db

# CNN Classifier with integrated components
class SimpleCNN:
    def __init__(self, lr=0.001):
        self.lr = lr
        self.conv1 = Conv1d(1, 8, 5, pad=2)
        self.conv2 = Conv1d(8, 16, 3, pad=1)

        # INTEGRATED FULLYCONNECTED LAYER PARAMETERS
        self.fc_W = np.random.randn(16*784, 10) * np.sqrt(2.0/(16*784))
        self.fc_b = np.zeros(10)
        self.fc_cache = None

        self.layers = [self.conv1, self.conv2]

    def forward(self, x):
        # CONV1 + INTEGRATED ReLU
        x = self.conv1.forward(x)
        x = np.maximum(0, x)  # INTEGRATED ReLU ACTIVATION

        # CONV2 + INTEGRATED ReLU
        x = self.conv2.forward(x)
        x = np.maximum(0, x)  # INTEGRATED ReLU ACTIVATION

        # INTEGRATED FULLYCONNECTED LAYER FORWARD PASS
        x_flat = x.reshape(x.shape[0], -1)
        self.fc_cache = x_flat
        logits = np.dot(x_flat, self.fc_W) + self.fc_b

        # INTEGRATED SOFTMAX ACTIVATION
        exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

        return probs

    def backward(self, d_out):
        # INTEGRATED SOFTMAX BACKWARD
        x_flat = self.fc_cache
        batch_size = x_flat.shape[0]

        # Calculating gradients for FC layer
        d_fc_W = np.dot(x_flat.T, d_out) / batch_size
        d_fc_b = np.sum(d_out, axis=0) / batch_size

        # Backpropagate through FC layer
        d_conv_out = np.dot(d_out, self.fc_W.T)
        d_conv_out = d_conv_out.reshape(-1, 16, 784)

        # Updating FC parameters
        self.fc_W -= self.lr * d_fc_W
        self.fc_b -= self.lr * d_fc_b

        # Backpropagate through convolutional layers with INTEGRATED ReLU
        d_conv_out[d_conv_out < 0] = 0  # ReLU gradient
        d_conv_out = self.conv2.backward(d_conv_out)

        d_conv_out[d_conv_out < 0] = 0  # ReLU gradient
        d_conv_out = self.conv1.backward(d_conv_out)

        return d_conv_out

    def fit(self, X, y, epochs=3, batch_size=64):
        for epoch in range(epochs):
            for i in range(0, len(X), batch_size):
                X_batch = X[i:i+batch_size]
                y_batch = y[i:i+batch_size]

                # Forward pass (includes INTEGRATED ReLU, FullyConnected, and Softmax)
                probs = self.forward(X_batch)

                # Calculating loss gradient
                d_out = (probs - y_batch) / batch_size

                # Backward pass (includes INTEGRATED components)
                self.backward(d_out)

                # Updating convolutional layers
                for layer in self.layers:
                    layer.update(self.lr)

            # Calculating accuracy
            preds = np.argmax(self.forward(X), axis=1)
            true_labels = np.argmax(y, axis=1)
            acc = accuracy_score(true_labels, preds)
            print(f"Epoch {epoch+1}, Accuracy: {acc:.4f}")

# Main execution
if __name__ == "__main__":
    print("=== INTEGRATION DEMONSTRATION ===")
    print("Components integrated:")
    print("1. ReLU activation (inline np.maximum(0, x))")
    print("2. FullyConnected layer (integrated weights and matrix multiplication)")
    print("3. Softmax activation (inline computation)")
    print("="*40)

    # Loading and preparing data
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False, parser='pandas')
    X, y = X[:2000].astype('float32'), y[:2000].astype('int32')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fitting the scaler on training data first, then transforming both
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train).reshape(-1, 1, 784)  # Fit and transform training
    X_test = scaler.transform(X_test).reshape(-1, 1, 784)        # Transform test using fitted scaler

    # One-hot encode
    y_train_oh = np.eye(10)[y_train]
    y_test_oh = np.eye(10)[y_test]

    # Training and evaluation
    model = SimpleCNN(lr=0.001)
    model.fit(X_train, y_train_oh, epochs=3)

    test_preds = np.argmax(model.forward(X_test), axis=1)
    test_acc = accuracy_score(y_test, test_preds)
    print(f"\nFinal Test Accuracy: {test_acc:.4f}")

=== INTEGRATION DEMONSTRATION ===
Components integrated:
1. ReLU activation (inline np.maximum(0, x))
2. FullyConnected layer (integrated weights and matrix multiplication)
3. Softmax activation (inline computation)
Epoch 1, Accuracy: 0.1194
Epoch 2, Accuracy: 0.1106
Epoch 3, Accuracy: 0.1200

Final Test Accuracy: 0.1150
