<a href="https://colab.research.google.com/github/SY-256/basics-of-image-recognition/blob/main/chapter03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 畳み込みニューラルネットワーク

In [None]:
# Directory on the mounted Drive; later cells prepend it to the file names they read and write.
base_path = "/content/drive/MyDrive/colab-notebooks/basics-of-image-recognition/"

In [None]:
import numpy as np

def im2col(img, h, w, stride=1, pad=0):
    """
    Unfold the sliding windows of a batch of images into the rows of a matrix.

    img: input array of shape (B, C, H, W)
    h: kernel height
    w: kernel width
    stride: step between neighbouring windows
    pad: number of zero rows/columns added on every spatial border

    Returns an array of shape (B*out_h*out_w, C*h*w). Each row is one
    flattened window, ordered by channel, then kernel row, then kernel column.
    """
    n_batch, n_ch, in_h, in_w = img.shape
    out_h = (in_h + 2*pad - h)//stride + 1
    out_w = (in_w + 2*pad - w)//stride + 1

    border = (pad, pad)
    padded = np.pad(img, [(0, 0), (0, 0), border, border], "constant")

    cols = np.zeros((n_batch, n_ch, h, w, out_h, out_w))
    # For each kernel offset, copy in one slice the strided grid of pixels
    # that this offset touches across all output positions.
    for ky, kx in np.ndindex(h, w):
        y_stop = ky + stride*out_h
        x_stop = kx + stride*out_w
        cols[:, :, ky, kx, :, :] = padded[:, :, ky:y_stop:stride, kx:x_stop:stride]

    # Move (batch, out_y, out_x) to the front so that each row is one window.
    cols = cols.transpose(0, 4, 5, 1, 2, 3)
    return cols.reshape(n_batch*out_h*out_w, -1)

class Conv2D:
    """
    2-D convolution layer evaluated as one matrix product over im2col patches.

    in_ch: number of input channels
    out_ch: number of output channels
    h, w: kernel height and width
    stride: step between neighbouring kernel positions
    pad: number of zero pixels added on every spatial border
    img (argument of forward): input batch of shape (B, in_ch, H, W)
    """
    def __init__(self, in_ch, out_ch, h, w, stride=1, pad=0):
        self.out_ch = out_ch
        self.stride = stride
        self.pad = pad
        # Kernels are drawn from a standard normal distribution; biases start at zero.
        self.filters = np.random.randn(out_ch, in_ch, h, w)
        self.bias = np.zeros(out_ch)

    def forward(self, img):
        """Apply the layer to img and return an array of shape (B, out_ch, out_h, out_w)."""
        n_batch, _, in_h, in_w = img.shape
        n_filters, _, k_h, k_w = self.filters.shape
        stride, pad = self.stride, self.pad

        # One row per output position, one column per (channel, ky, kx) entry.
        patches = im2col(img, k_h, k_w, stride, pad)
        # One column per output channel, flattened in the same order as the patches.
        kernel_matrix = self.filters.reshape(n_filters, -1).T
        flat = np.dot(patches, kernel_matrix) + self.bias

        out_h = 1 + int((in_h + 2*pad - k_h) / stride)
        out_w = 1 + int((in_w + 2*pad - k_w) / stride)
        # Rows are ordered (batch, out_y, out_x); move the channel axis to position 1.
        return flat.reshape(n_batch, out_h, out_w, -1).transpose(0, 3, 1, 2)

In [None]:
### edge detection
import matplotlib.pyplot as plt
from PIL import Image

# Load the image as 8-bit grayscale, scale it to [0, 1], and add batch and
# channel axes so it has the (B, C, H, W) layout that Conv2D.forward unpacks.
img = Image.open(base_path + "keyboard.png").convert("L")
img = np.array(img, dtype=float) / 255
img = img[np.newaxis, np.newaxis, :, :]

conv = Conv2D(in_ch=1, out_ch=1, h=3, w=3, stride=1, pad=1)

# 3x3 Sobel kernels. The layer has a single filter slot, so only one kernel
# can be active at a time; the vertical-edge kernel is the one applied here.
horizontal_edge_kernel = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]])
vertical_edge_kernel = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
# Assign horizontal_edge_kernel instead to detect horizontal edges.
conv.filters[0, 0, :, :] = vertical_edge_kernel

out = conv.forward(img)
# Clamp to the displayable range; negative filter responses become 0.
out = np.clip(out, 0, 1)

# Show input and filter response side by side and save the figure.
plt.subplot(121)
plt.imshow(img[0, 0], cmap="gray")
plt.title("Input Image")
plt.subplot(122)
plt.imshow(out[0, 0], cmap="gray")
plt.title("Output Image")
plt.savefig(base_path + "convoled_output.png")
plt.show()

# GAP（グローバルアベレージプーリング）をスクラッチで

In [None]:
### GAP（グローバルアベレージプーリング）
import numpy as np

def global_average_poooling_2d(x):
    """
    Average a channels-last feature map over its two spatial axes.

    Args:
        x: input array shaped (batch, height, width, channels) or
           (height, width, channels)
    Returns:
        array shaped (batch, channels) or (channels,)
    Raises:
        ValueError: if x is neither 3- nor 4-dimensional
    """
    # Spatial axes to reduce, keyed by the rank of the input.
    spatial_axes = {4: (1, 2), 3: (0, 1)}
    if x.ndim not in spatial_axes:
        raise ValueError("入力次元は3次元または4次元である必要があります")
    return np.mean(x, axis=spatial_axes[x.ndim])

def global_average_poooling_1d(x):
    """
    Average a channels-last sequence over its length axis.

    Args:
        x: input array shaped (batch_size, length, channels) or
           (length, channels)

    Returns:
        array shaped (batch_size, channels) or (channels,)

    Raises:
        ValueError: if x is neither 2- nor 3-dimensional
    """
    # Position of the length axis, keyed by the rank of the input.
    length_axis = {3: 1, 2: 0}
    if x.ndim not in length_axis:
        raise ValueError("入力は2次元または3次元である必要があります")
    return np.mean(x, axis=length_axis[x.ndim])

class GlobalAveragePooling2D:
    """Layer form of global average pooling for channels-last inputs.

    forward() records the input shape so that backward() can spread the
    incoming gradient back over the spatial positions.
    """

    def forward(self, x):
        """Pool x over its spatial axes.

        Args:
            x: array shaped (batch, height, width, channels) or
               (height, width, channels)
        Returns:
            array shaped (batch, channels) or (channels,)
        """
        self.input_shape = x.shape
        return global_average_poooling_2d(x)

    def backward(self, grad_output):
        """Propagate the gradient of the pooled output back to the input.

        Each spatial position contributes 1/(height*width) to the mean, so
        every position receives grad_output / (height*width) for its channel.

        Args:
            grad_output: gradient with respect to the output of forward(),
                shaped (batch, channels) or (channels,)
        Returns:
            float array with the shape recorded by forward()
        Raises:
            ValueError: if the recorded input shape is not 3- or 4-dimensional,
                or if grad_output does not have the pooled shape
        """
        if len(self.input_shape) not in (3, 4):
            raise ValueError("入力次元は3次元または4次元である必要があります")

        grad_output = np.asarray(grad_output)
        # Pooled shape = input shape with the two spatial axes removed.
        pooled_shape = tuple(self.input_shape[:-3]) + (self.input_shape[-1],)
        if grad_output.shape != pooled_shape:
            raise ValueError(
                f"grad_output shape {grad_output.shape} does not match pooled shape {pooled_shape}"
            )

        height, width = self.input_shape[-3:-1]
        grad_input = np.zeros(self.input_shape)
        # Insert singleton height/width axes and let broadcasting copy the
        # scaled gradient to every spatial position.
        grad_input[...] = grad_output[..., np.newaxis, np.newaxis, :] / (height * width)
        return grad_input


In [None]:
print("=== Global Average Pooling 2D ===")
# Single image without a batch axis (height=4, width=4, channels=3)
x_2d = np.random.randn(4, 4, 3)
print(f"入力形状: {x_2d.shape}")
print(f"入力値: {x_2d}")

output_2d = global_average_poooling_2d(x_2d)
print(f"出力形状: {output_2d.shape}")
print(f"出力値: {output_2d}")

# Batched input (batch=2, height=4, width=4, channels=3)
x_batch_2d = np.random.randn(2, 4, 4, 3)
print(f"\n入力バッチ形状: {x_batch_2d.shape}")

output_batch_2d = global_average_poooling_2d(x_batch_2d)
print(f"バッチ出力形状: {output_batch_2d.shape}")

print("\n==== クラス版 ====")
gap_layer = GlobalAveragePooling2D()

x_test = np.random.randn(1, 3, 3, 2)
output = gap_layer.forward(x_test)
print(f"入力形状: {x_test.shape}")
print(f"出力形状: {output.shape}")

# Backward-pass check: with an all-ones upstream gradient every input
# position should receive 1 / (3 * 3).
grad_output = np.ones_like(output)
grad_input = gap_layer.backward(grad_output)
print(f"勾配入力形状: {grad_input.shape}")
print(f"勾配値の例: {grad_input[0, 0, 0, :]}")


=== Global Average Pooling 2D ===
入力形状: (4, 4, 3)
入力値: [[[ 1.9216899   0.76574373  0.19227038]
  [ 0.90747202 -1.00150843 -0.93774835]
  [ 0.45887446  1.36584178 -0.61927072]
  [ 0.66800848 -0.97647051 -0.8558606 ]]

 [[ 1.51556905 -0.81584812  1.353832  ]
  [ 0.82094649 -1.34260133 -0.19318977]
  [-0.7527346   1.99628424 -1.3232735 ]
  [-0.76252744  1.4831022   1.13187252]]

 [[ 1.33507294  1.27968203  0.43559142]
  [ 0.19062981  0.41805862 -0.30810173]
  [-0.94957467 -0.17696031  0.8310906 ]
  [ 0.57156049 -0.66808225 -1.60692287]]

 [[-1.21114089  1.37205759 -0.22141397]
  [-0.19499464  0.24747499 -0.61352329]
  [ 0.11728387  0.46471609  0.68810329]
  [ 0.42640274  1.00966312 -0.35621508]]]
出力形状: (3,)
出力値: [ 0.31640863  0.33882209 -0.15017248]

入力バッチ形状: (2, 4, 4, 3)
バッチ出力形状: (2, 3)

==== クラス版 ====
入力形状: (1, 3, 3, 2)
出力形状: (1, 2)
勾配入力形状: (1, 3, 3, 2)
勾配値の例: [0.11111111 0.11111111]


# 残差学習をスクラッチで

In [None]:
import numpy as np

def relu(x):
    """ReLU activation: element-wise maximum of 0 and x."""
    floor = 0
    return np.maximum(floor, x)

def conv2d(x, weights, bias=None, stride=1, padding=0):
    """2-D convolution (no kernel flip) over a channels-last batch.

    Args:
        x: input array shaped (batch, height, width, in_channels)
        weights: kernels shaped (out_channels, kernel_h, kernel_w, in_channels)
        bias: optional per-output-channel offsets shaped (out_channels,)
        stride: step between neighbouring kernel positions on both spatial axes
        padding: number of zeros added on every spatial border
    Returns:
        float array shaped (batch, out_height, out_width, out_channels)
    Raises:
        ValueError: if the channel counts of x and weights differ
    """
    batch_size, _, _, in_channels = x.shape
    out_channels, kernel_h, kernel_w, kernel_channels = weights.shape

    # Without this check a size-1 channel axis on either side would be
    # broadcast silently and produce a wrong result instead of an error.
    if in_channels != kernel_channels:
        raise ValueError(
            f"入力チャネル数 ({in_channels}) と重みのチャネル数 ({kernel_channels}) が一致しません"
        )

    # Zero-pad the two spatial axes only.
    if padding > 0:
        x = np.pad(x, ((0, 0), (padding, padding), (padding, padding), (0, 0)), "constant")

    out_height = (x.shape[1] - kernel_h) // stride + 1
    out_width = (x.shape[2] - kernel_w) // stride + 1

    output = np.zeros((batch_size, out_height, out_width, out_channels))

    # Contract (kernel_h, kernel_w, channels) of the patch against the same
    # axes of every kernel: (B, kh, kw, C) x (O, kh, kw, C) -> (B, O).
    kernel_axes = ([1, 2, 3], [1, 2, 3])
    for oh in range(out_height):
        h_start = oh * stride
        for ow in range(out_width):
            w_start = ow * stride
            patch = x[:, h_start:h_start+kernel_h, w_start:w_start+kernel_w, :]
            output[:, oh, ow, :] = np.tensordot(patch, weights, axes=kernel_axes)

    if bias is not None:
        # Broadcasts over the trailing output-channel axis.
        output += bias
    return output

def batch_norm(x, gamma, beta, eps=1e-5):
    """Normalize x with statistics of the current batch, then scale and shift.

    Mean and variance are taken over axes 0, 1 and 2, leaving one value per
    entry of the last axis. gamma scales and beta shifts the normalized
    result; eps is added to the variance before the square root.
    """
    reduce_axes = (0, 1, 2)
    batch_mean = np.mean(x, axis=reduce_axes, keepdims=True)
    batch_var = np.var(x, axis=reduce_axes, keepdims=True)
    centered = x - batch_mean
    normalized = centered / np.sqrt(batch_var + eps)
    return gamma * normalized + beta

class BasicBlock:
    """ResNet basic block: two 3x3 convolutions plus a skip connection."""

    def __init__(self, in_channels, out_channels, stride=1):
        self.stride = stride
        self.in_channels = in_channels
        self.out_channels = out_channels

        # Weights of the two 3x3 convolutions, laid out (out, kh, kw, in),
        # with zero biases. The first one maps in_channels -> out_channels.
        self.conv1d_weights = np.random.randn(out_channels, 3, 3, in_channels) * 0.1
        self.conv1d_bias = np.zeros(out_channels)

        self.conv2d_weights = np.random.randn(out_channels, 3, 3, out_channels) * 0.1
        self.conv2d_bias = np.zeros(out_channels)

        # Batch-norm scale (ones) and shift (zeros) for each convolution.
        self.bn1_gamma, self.bn1_beta = np.ones(out_channels), np.zeros(out_channels)
        self.bn2_gamma, self.bn2_beta = np.ones(out_channels), np.zeros(out_channels)

        # The skip path needs a 1x1 projection whenever the main path changes
        # the spatial size (stride) or the channel count.
        needs_projection = (stride != 1) or (in_channels != out_channels)
        self.use_shortcut_conv = needs_projection
        if needs_projection:
            self.shortcut_weights = np.random.randn(out_channels, 1, 1, in_channels) * 0.1
            self.shortcut_bias = np.zeros(out_channels)
            self.shortcut_bn_gamma = np.ones(out_channels)
            self.shortcut_bn_beta = np.zeros(out_channels)

    def _shortcut(self, x):
        """Return x unchanged, or its 1x1-conv + batch-norm projection when one is configured."""
        if not self.use_shortcut_conv:
            return x
        projected = conv2d(x, self.shortcut_weights, self.shortcut_bias,
                           stride=self.stride, padding=0)
        return batch_norm(projected, self.shortcut_bn_gamma, self.shortcut_bn_beta)

    def forward(self, x):
        """Forward pass: conv-BN-ReLU, conv-BN, add the skip path, final ReLU."""
        # Main path, first stage (applies the block's stride).
        main = conv2d(x, self.conv1d_weights, self.conv1d_bias, stride=self.stride, padding=1)
        main = relu(batch_norm(main, self.bn1_gamma, self.bn1_beta))

        # Main path, second stage (always stride 1).
        main = conv2d(main, self.conv2d_weights, self.conv2d_bias, stride=1, padding=1)
        main = batch_norm(main, self.bn2_gamma, self.bn2_beta)

        # Residual connection followed by the output activation.
        return relu(main + self._shortcut(x))

class SimpleResNet:
    """Small ResNet: stem convolution, three stages of basic blocks, pooling and a linear classifier."""

    def __init__(self, num_classes=10):
        self.num_classes = num_classes

        # Stem: 7x7 convolution from 3 to 64 channels with its batch-norm parameters.
        self.conv1d_weights = np.random.randn(64, 7, 7, 3) * 0.1
        self.conv1d_bias = np.zeros(64)
        self.bn1_gamma = np.ones(64)
        self.bn1_beta = np.zeros(64)

        # Three stages of two blocks each; stages 2 and 3 open with a
        # stride-2 block that also doubles the channel count.
        self.layer1 = [BasicBlock(64, 64), BasicBlock(64, 64)]
        self.layer2 = [BasicBlock(64, 128, stride=2), BasicBlock(128, 128)]
        self.layer3 = [BasicBlock(128, 256, stride=2), BasicBlock(256, 256)]

        # Linear classifier on the 256 pooled features.
        self.fc_weights = np.random.randn(256, num_classes) * 0.1
        self.fc_bias = np.zeros(num_classes)

    def forward(self, x):
        """Forward pass; returns one score per class for each sample in x."""
        # Stem: stride-2 convolution, batch norm, ReLU.
        feats = conv2d(x, self.conv1d_weights, self.conv1d_bias, stride=2, padding=3)
        feats = relu(batch_norm(feats, self.bn1_gamma, self.bn1_beta))

        # Residual stages, in order.
        for stage in (self.layer1, self.layer2, self.layer3):
            for block in stage:
                feats = block.forward(feats)

        # Global average pooling over axes 1 and 2.
        pooled = np.mean(feats, axis=(1, 2))

        # Fully connected layer.
        return np.dot(pooled, self.fc_weights) + self.fc_bias

In [None]:
# Test data
batch_size = 2
x = np.random.randn(batch_size, 32, 32, 3)

print("=== BasicBlockテスト ===")
basic_block = BasicBlock(3, 64, stride=1)
output_basic = basic_block.forward(x)
print(f"入力形状: {x.shape}")
print(f"BasicBlock出力形状: {output_basic.shape}")

print("\n=== SimpleResNet ===")
model = SimpleResNet(num_classes=10)
output = model.forward(x)
print(f"ResNet出力形状: {output.shape}")
print(f"出力例: {output[0][:5]}") # first five class scores of the first sample

# Look at the effect of the residual connection
print("\n=== 残差接続の効果確認 ===")
# One forward pass through a block whose shortcut is the identity
# (64 -> 64 channels, stride 1, so no projection is used).
x_test = np.random.randn(1, 8, 8, 64)
block_test = BasicBlock(64, 64)

# Compare the block output with its input.
output_test = block_test.forward(x_test)
residual = output_test - x_test # output minus input; NOTE(review): forward() applies a ReLU after the addition, so this is not exactly the convolutional branch's output
print(f"入力平均: {np.mean(x_test):.4f}")
print(f"出力平均: {np.mean(output_test):.4f}")
print(f"残差平均: {np.mean(residual):.4f}")
print("⇒ 残差学習により、恒等写像からの小さな変化を学習")

=== BasicBlockテスト ===
入力形状: (2, 32, 32, 3)
BasicBlock出力形状: (2, 32, 32, 64)

=== SimpleResNet ===
ResNet出力形状: (2, 10)
出力例: [ 0.88414622 -0.03603872  1.18273422 -0.09054755 -1.17715313]

=== 残差接続の効果確認 ===
入力平均: 0.0114
出力平均: 0.5692
残差平均: 0.5579
⇒ 残差学習により、恒等写像からの小さな変化を学習
