In [1]:
from abc import abstractmethod
import numpy as np

class Layer():
    def __init__(self) -> None:
        self.optimizable = True
    
    @abstractmethod
    def forward():
        pass

    @abstractmethod
    def backward():
        pass


In [2]:
class conv2D(Layer):
    """
    The 2D convolutional layer. Try to implement it on your own.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, initialize_method=np.random.normal, weight_decay=False, weight_decay_lambda=1e-8) -> None:
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        self.filters = initialize_method(size=(out_channels, in_channels, self.kernel_size, self.kernel_size))
        # [out_channels, in_channels, kernel, kernel]
        self.bias = np.zeros((out_channels,))

        self.grads = {'W': None, 'b': None}
        self.input = None  # Record the input for backward process.

        self.params = {'W': self.filters, 'b': self.bias}

        self.weight_decay = weight_decay  # whether using weight decay
        self.weight_decay_lambda = weight_decay_lambda  # control the intensity of weight decay

    def __call__(self, X) -> np.ndarray:
        return self.forward(X)
    
    def forward(self, X):
        """
        input X: [batch, channels, H, W]
        W : [1, out, in, k, k]
        no padding
        """

        self.input = X
        batch_size, in_channels, H_in, W_in = X.shape
        k = self.kernel_size

        # padding
        if self.padding > 0:
            X_padded = np.pad(X, ((0, 0), (0, 0), (self.padding,) * 2, (self.padding,) * 2))
        else:
            X_padded = X
            
        print("Padded Input:")
        print(X_padded)

        H_out = (H_in + 2 * self.padding - k) // self.stride + 1
        W_out = (W_in + 2 * self.padding - k) // self.stride + 1
        output = np.zeros((batch_size, self.out_channels, H_out, W_out))

        # 计算卷积
        for i in range(H_out):
            h_start = i * self.stride
            h_end = h_start + k
            for j in range(W_out):
                w_start = j * self.stride
                w_end = w_start + k

                window = X_padded[:, :, h_start:h_end, w_start:w_end]
                output[:, :, i, j] = np.tensordot(
                    window, self.filters, axes=([1, 2, 3], [1, 2, 3])
                ) + self.bias
        print("Convolution Output:")
        print(output)
        return output

        
        pass

    def backward(self, grads):
        """
        grads : [batch_size, out_channel, new_H, new_W]
        """

        batch_size, out_channels, H_out, W_out = grads.shape
        k = self.kernel_size

        # 初始化梯度
        dX = np.zeros_like(self.input, dtype=np.float64)  # 确保使用float64
        dfilters = np.zeros_like(self.filters, dtype=np.float64)  # 确保使用float64
        dbias = np.zeros_like(self.bias, dtype=np.float64)  # 确保使用float64

        # 旋转卷积核（关键步骤）
        rotated_filters = np.rot90(self.filters, 2, axes=(2, 3))
        rotated_filters=self.filters

        # 处理padding
        if self.padding > 0:
            X_padded = np.pad(self.input,
                              ((0, 0), (0, 0), (self.padding,) * 2, (self.padding,) * 2))
            dX_padded = np.pad(dX,
                               ((0, 0), (0, 0), (self.padding,) * 2, (self.padding,) * 2))
        else:
            X_padded = self.input
            dX_padded = dX
            
        # print("padded X:",X_padded)
        # print("padded dX:",dX_padded)

        for i in range(H_out):
            for j in range(W_out):
                h_start = i * self.stride
                w_start = j * self.stride
                window = X_padded[:, :, h_start:h_start + k, w_start:w_start + k]

                # 1. 计算滤波器梯度（修正转置问题）
                dfilters += np.tensordot(
                    grads[:, :, i, j],  # shape: (batch, out_ch)
                    window,  # shape: (batch, in_ch, k, k)
                    axes=([0], [0])  # 沿着batch维度做点积
                )  # 结果形状: (out_ch, in_ch, k, k)

                # 2. 计算输入梯度（修正维度扩展）
                grad_slice = grads[:, :, i, j][:, :, np.newaxis, np.newaxis,
                             np.newaxis]  # shape: (batch, out_ch, 1, 1, 1)
                dX_padded[:, :, h_start:h_start + k, w_start:w_start + k] += np.sum(
                    grad_slice * rotated_filters[np.newaxis, :, :, :, :],  # shape: (1, out_ch, in_ch, k, k)
                    axis=1  # 沿out_ch维度求和
                )  # 结果形状: (batch, in_ch, k, k)

                # 3. 计算偏置梯度（修正索引方式）
                dbias += np.sum(grads[:, :, i, j], axis=0)  # 向量化计算

                # print(f"位置({i},{j})")
                # print("滤波器梯度增量:", np.sum(dfilters))
                # print("输入梯度增量:", np.sum(dX_padded))

        # 去除padding
        if self.padding > 0:
            dX = dX_padded[:, :, self.padding:-self.padding, self.padding:-self.padding]

        # 权重衰减
        if self.weight_decay:
            dfilters += 2 * self.weight_decay_lambda * self.filters

        self.grads['W'] = dfilters/batch_size
        self.grads['b'] = dbias/batch_size
        
        # print("dW",self.grads['W'])
        # print("dbias",self.grads['b'])
        # print("dX",dX)
        return dX
    
    def clear_grad(self):
        self.grads = {'W' : None, 'b' : None}

In [3]:
layer=conv2D(in_channels=1,out_channels=1,kernel_size=3,stride=1,padding=1)
layer.filters = np.array([[[[1, 0, -1], [1, 0, -1], [1, 0, -1]]]])  # 经典的边缘检测核
layer.bias = np.array([0])  # 设定偏置为 0
X = np.array([[[[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]]]])

ans=layer(X)


Padded Input:
[[[[0 0 0 0 0]
   [0 1 2 3 0]
   [0 4 5 6 0]
   [0 7 8 9 0]
   [0 0 0 0 0]]]]
Convolution Output:
[[[[ -7.  -4.   7.]
   [-15.  -6.  15.]
   [-13.  -4.  13.]]]]


In [19]:
layer=conv2D(in_channels=1,out_channels=2,kernel_size=3,stride=1,padding=1)

layer.filters = np.array([
    [[[1, 1, 1],  
      [1, 1, 1],  
      [1, 1, 1]]],  # 第一组卷积核（全 1），形状 (1,3,3)

    [[[2, 2, 2],  
      [2, 2, 2],  
      [2, 2, 2]]],  # 第二组卷积核（全 2），形状 (1,3,3)
])


layer.bias = np.array([0, 0]) # 设定偏置为 0
print(layer.filters.shape)
print(layer.bias.shape)

X = np.array([[[[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]]]])

output=layer(X)

print(output.shape)


(2, 1, 3, 3)
(2,)
Padded Input:
[[[[0 0 0 0 0]
   [0 1 2 3 0]
   [0 4 5 6 0]
   [0 7 8 9 0]
   [0 0 0 0 0]]]]
Convolution Output:
[[[[12. 21. 16.]
   [27. 45. 33.]
   [24. 39. 28.]]

  [[24. 42. 32.]
   [54. 90. 66.]
   [48. 78. 56.]]]]
(1, 2, 3, 3)


In [20]:
# ====== 反向传播测试 ======
# 1. 定义损失函数（假设损失为输出的总和）
loss = np.sum(output)
print("初始损失值:", loss)

# 2. 反向传播计算梯度
# 生成模拟的顶层梯度（与输出形状相同，全1）
dout = np.ones_like(output)
dx = layer.backward(dout)

# 3. 打印反向传播得到的梯度
print("\n=== 反向传播梯度 ===")
print("卷积核梯度 (W):")
print(layer.grads['W'].round(4))  # 形状 (2,1,3,3)
print("\n偏置梯度 (b):")
print(layer.grads['b'].round(4))  # 形状 (2,)
print("\n输入梯度 (dX):")
print(dx.round(4))                # 形状 (1,1,3,3)



初始损失值: 735.0

=== 反向传播梯度 ===
卷积核梯度 (W):
[[[[12. 21. 16.]
   [27. 45. 33.]
   [24. 39. 28.]]]


 [[[12. 21. 16.]
   [27. 45. 33.]
   [24. 39. 28.]]]]

偏置梯度 (b):
[9. 9.]

输入梯度 (dX):
[[[[12. 18. 12.]
   [18. 27. 18.]
   [12. 18. 12.]]]]


从上面的结果来看，卷积核梯度正确，dX正确，db怎么算？db是每个核的输出全部+b，所以是9？



In [3]:
layer=conv2D(in_channels=1,out_channels=2,kernel_size=3,stride=1,padding=1)

layer.filters = np.array([
    [[[2, 1, 1],  
      [1, 1, 1],  
      [1, 1, 1]]],  # 第一组卷积核（全 1），形状 (1,3,3)

    [[[3, 1, 1],  
      [1, 1, 1],  
      [1, 1, 1]]],  # 第二组卷积核（全 2），形状 (1,3,3)
])


layer.bias = np.array([0, 0]) # 设定偏置为 0
print(layer.filters.shape)
print(layer.bias.shape)

X = np.array([[[[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]]]])

output=layer(X)

print(output.shape)

(2, 1, 3, 3)
(2,)
Padded Input:
[[[[0 0 0 0 0]
   [0 1 2 3 0]
   [0 4 5 6 0]
   [0 7 8 9 0]
   [0 0 0 0 0]]]]
Convolution Output:
[[[[12. 21. 16.]
   [27. 46. 35.]
   [24. 43. 33.]]

  [[12. 21. 16.]
   [27. 47. 37.]
   [24. 47. 38.]]]]
(1, 2, 3, 3)


In [4]:
loss = np.sum(output)
print("初始损失值:", loss)

# 2. 反向传播计算梯度
# 生成模拟的顶层梯度（与输出形状相同，全1）
dout = np.ones_like(output)
dx = layer.backward(dout)

# 3. 打印反向传播得到的梯度
print("\n=== 反向传播梯度 ===")
print("卷积核梯度 (W):")
print(layer.grads['W'].round(4))  # 形状 (2,1,3,3)
print("\n偏置梯度 (b):")
print(layer.grads['b'].round(4))  # 形状 (2,)
print("\n输入梯度 (dX):")
print(dx.round(4))                # 形状 (1,1,3,3)

初始损失值: 526.0

=== 反向传播梯度 ===
卷积核梯度 (W):
[[[[12. 21. 16.]
   [27. 45. 33.]
   [24. 39. 28.]]]


 [[[12. 21. 16.]
   [27. 45. 33.]
   [24. 39. 28.]]]]

偏置梯度 (b):
[9. 9.]

输入梯度 (dX):
[[[[11. 15.  8.]
   [15. 21. 12.]
   [ 8. 12.  8.]]]]
