<a href="https://colab.research.google.com/github/SILVIAIRENE/Data-Scientist-Machine-Learning-Engineer-Introductory-Course/blob/master/SimpleConv2d_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
# Red Neuronal Convolucional 2D Scratch - Solución completa (lista para Google Colab)
# Autor: Respuesta generada por un Ingeniero ML experto (implementación NumPy)
# NOTE: This script uses only NumPy, plus tf.keras to load MNIST (datasets.mnist) and to one-hot encode labels (utils.to_categorical).
# Ejecuta en Google Colab: asegúrate de tener suficiente RAM; el entrenamiento aquí es pequeño por rapidez.

# ----------------------------
# [Problema 0] Imports y utilidades
# ----------------------------
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from time import time

# Reproducibilidad rápida
np.random.seed(42)

# ----------------------------
# [Problema 3] Función: tamaño de salida después de la convolución 2D
# Fórmula:
# N_out_h = (N_in_h + 2*PAD_h - F_h) // S_h + 1
# N_out_w = (N_in_w + 2*PAD_w - F_w) // S_w + 1
# Implementamos una función utilitaria.
# ----------------------------
def conv2d_output_size(n_in_h, n_in_w, filter_h, filter_w, pad_h=0, pad_w=0, stride_h=1, stride_w=1):
    """Return the spatial size ``(n_out_h, n_out_w)`` of a 2-D convolution output.

    Each axis is computed independently as
    ``(n_in + 2 * pad - filter) // stride + 1``; the floor division drops
    trailing positions that a full window cannot cover.
    """
    def _along_axis(n_in, filt, pad, stride):
        # Number of window positions that fit along a single axis.
        return (n_in + 2 * pad - filt) // stride + 1

    return (
        _along_axis(n_in_h, filter_h, pad_h, stride_h),
        _along_axis(n_in_w, filter_w, pad_w, stride_w),
    )

# ----------------------------
# [Problema 1] Clase Conv2d (NCHW)
# Implementa forward y backward (grad w.r.t weights, bias y entrada).
# Soporta padding y stride (integers or tuples).
# ----------------------------
class Conv2d:
    """2-D convolution (cross-correlation) layer for NCHW tensors, NumPy only.

    Weights ``W`` have shape ``(out_channels, in_channels, kh, kw)`` and are
    drawn from a normal distribution scaled by ``sqrt(2 / fan_in)`` (He
    initialisation); the bias ``b`` has shape ``(out_channels,)`` and starts
    at zero.  ``forward`` caches what ``backward`` needs, ``backward`` stores
    ``dW`` / ``db`` on the layer and returns the input gradient, and ``step``
    applies a plain SGD update.

    ``kernel_size``, ``stride`` and ``padding`` each accept an integer (used
    for both axes) or a ``(height, width)`` pair.

    Raises:
        ValueError: if a stride is not positive or a padding is negative.
    """

    @staticmethod
    def _pair(value):
        """Expand an integer to ``(value, value)``; unpack a 2-sequence as is."""
        if isinstance(value, (int, np.integer)):
            return value, value
        first, second = value
        return first, second

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        kh, kw = self._pair(kernel_size)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kh = kh
        self.kw = kw
        self.stride_h, self.stride_w = self._pair(stride)
        self.pad_h, self.pad_w = self._pair(padding)
        if self.stride_h <= 0 or self.stride_w <= 0:
            raise ValueError("stride must be positive")
        if self.pad_h < 0 or self.pad_w < 0:
            raise ValueError("padding must be non-negative")

        # He initialisation for the filters.
        fan_in = in_channels * kh * kw
        scale = np.sqrt(2.0 / fan_in)
        self.W = np.random.randn(out_channels, in_channels, kh, kw) * scale
        self.b = np.zeros((out_channels,), dtype=np.float32)

        # Filled by forward(): (input, padded input, output shape).
        self.cache = None
        # Gradients from the most recent backward() call.
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def forward(self, x):
        """Convolve ``x`` of shape ``(N, C, H, W)`` with the layer's filters.

        Returns a float32 array of shape ``(N, out_channels, out_h, out_w)``.

        Raises:
            ValueError: if ``C`` differs from ``in_channels``.
        """
        N, C, H, W = x.shape
        if C != self.in_channels:
            raise ValueError(
                f"Input channel mismatch: expected {self.in_channels}, got {C}"
            )
        out_h, out_w = conv2d_output_size(
            H, W, self.kh, self.kw, self.pad_h, self.pad_w, self.stride_h, self.stride_w
        )
        y = np.zeros((N, self.out_channels, out_h, out_w), dtype=np.float32)

        x_padded = np.pad(
            x,
            ((0, 0), (0, 0), (self.pad_h, self.pad_h), (self.pad_w, self.pad_w)),
            mode='constant',
        )
        # Only the output positions are looped over in Python; the batch and
        # output-channel dimensions are contracted by tensordot in one call.
        for i in range(out_h):
            h0 = i * self.stride_h
            for j in range(out_w):
                w0 = j * self.stride_w
                patch = x_padded[:, :, h0:h0 + self.kh, w0:w0 + self.kw]  # (N, C, kh, kw)
                y[:, :, i, j] = (
                    np.tensordot(patch, self.W, axes=([1, 2, 3], [1, 2, 3])) + self.b
                )
        self.cache = (x, x_padded, y.shape)
        return y

    def backward(self, d_out):
        """Back-propagate ``d_out`` (same shape as the forward output).

        Stores the weight and bias gradients in ``self.dW`` / ``self.db`` and
        returns the gradient with respect to the (unpadded) input.

        Raises:
            RuntimeError: if called before ``forward``.
            ValueError: if ``d_out`` does not match the cached output shape.
        """
        if self.cache is None:
            raise RuntimeError("Conv2d.backward() called before forward()")
        x, x_padded, out_shape = self.cache
        d_out = np.asarray(d_out)
        if tuple(d_out.shape) != tuple(out_shape):
            raise ValueError(
                f"d_out has shape {d_out.shape}, expected {tuple(out_shape)}"
            )
        _, _, H, W = x.shape
        _, _, out_h, out_w = out_shape

        # Accumulate in the widest of the gradient / weight dtypes so the sum
        # over the batch does not lose precision.
        d_out = d_out.astype(np.result_type(d_out.dtype, self.W.dtype), copy=False)

        dW = np.zeros_like(self.W)
        db = d_out.sum(axis=(0, 2, 3)).astype(self.b.dtype)
        # Always a floating dtype, so integer-valued inputs do not break the
        # in-place accumulation below.
        dx_padded = np.zeros(
            x_padded.shape, dtype=np.result_type(x_padded.dtype, np.float32)
        )

        for i in range(out_h):
            h0 = i * self.stride_h
            for j in range(out_w):
                w0 = j * self.stride_w
                patch = x_padded[:, :, h0:h0 + self.kh, w0:w0 + self.kw]  # (N, C, kh, kw)
                grad = d_out[:, :, i, j]  # (N, out_channels)
                # Sum over the batch: (out_channels, C, kh, kw).
                dW += np.tensordot(grad, patch, axes=([0], [0]))
                # Sum over output channels: (N, C, kh, kw).
                dx_padded[:, :, h0:h0 + self.kh, w0:w0 + self.kw] += np.tensordot(
                    grad, self.W, axes=([1], [0])
                )

        # Strip the padding border (a no-op slice when padding is zero).
        dx = dx_padded[:, :, self.pad_h:self.pad_h + H, self.pad_w:self.pad_w + W]

        self.dW = dW
        self.db = db
        return dx

    def step(self, lr=1e-3):
        """Apply one SGD update using the gradients stored by ``backward``."""
        self.W -= lr * self.dW
        self.b -= lr * self.db

# ----------------------------
# [Problema 4] Clase MaxPool2D (NCHW)
# Guarda índices de max para el backward
# ----------------------------
class MaxPool2D:
    """2-D max pooling over NCHW tensors (no padding).

    ``kernel_size`` and ``stride`` accept an integer or a ``(height, width)``
    pair; ``stride`` defaults to the kernel size.

    ``forward`` remembers, for every pooling window, the flat index of the
    winning element inside that window.  Indices are stored per window
    (rather than as one mask over the input) so that overlapping windows
    (stride smaller than the kernel) route each gradient to exactly one
    input element in ``backward``.
    """

    def __init__(self, kernel_size=2, stride=None):
        if isinstance(kernel_size, int):
            self.kh = self.kw = kernel_size
        else:
            self.kh, self.kw = kernel_size
        if stride is None:
            self.stride_h = self.kh
            self.stride_w = self.kw
        elif isinstance(stride, int):
            self.stride_h = self.stride_w = stride
        else:
            self.stride_h, self.stride_w = stride
        # Filled by forward(): (input shape, per-window argmax indices).
        self.cache = None

    def forward(self, x):
        """Pool ``x`` of shape ``(N, C, H, W)``; returns ``(N, C, out_h, out_w)``.

        Ties are resolved in favour of the first maximum in row-major order
        within the window.
        """
        N, C, H, W = x.shape
        out_h, out_w = conv2d_output_size(
            H, W, self.kh, self.kw, 0, 0, self.stride_h, self.stride_w
        )
        y = np.zeros((N, C, out_h, out_w), dtype=x.dtype)
        argmax = np.zeros((N, C, out_h, out_w), dtype=np.intp)
        window_size = self.kh * self.kw
        for i in range(out_h):
            h0 = i * self.stride_h
            for j in range(out_w):
                w0 = j * self.stride_w
                # All (n, c) windows at this output position, flattened row-major.
                windows = x[:, :, h0:h0 + self.kh, w0:w0 + self.kw].reshape(
                    N, C, window_size
                )
                argmax[:, :, i, j] = windows.argmax(axis=2)
                y[:, :, i, j] = windows.max(axis=2)
        self.cache = (x.shape, argmax)
        return y

    def backward(self, d_out):
        """Route each output gradient to the input element that won its window.

        Returns a float32 array with the shape of the forward input.

        Raises:
            RuntimeError: if called before ``forward``.
            ValueError: if ``d_out`` does not match the forward output shape.
        """
        if self.cache is None:
            raise RuntimeError("MaxPool2D.backward() called before forward()")
        x_shape, argmax = self.cache
        d_out = np.asarray(d_out)
        if tuple(d_out.shape) != tuple(argmax.shape):
            raise ValueError(
                f"d_out has shape {d_out.shape}, expected {tuple(argmax.shape)}"
            )
        N, C, out_h, out_w = argmax.shape
        dx = np.zeros(x_shape, dtype=np.float32)

        # Convert flat in-window indices to absolute input coordinates.
        offset_h, offset_w = np.divmod(argmax, self.kw)
        rows = offset_h + (np.arange(out_h) * self.stride_h)[:, None]
        cols = offset_w + np.arange(out_w) * self.stride_w
        n_idx = np.arange(N)[:, None, None, None]
        c_idx = np.arange(C)[None, :, None, None]
        # add.at accumulates correctly when several windows share a winner.
        np.add.at(dx, (n_idx, c_idx, rows, cols), d_out.astype(dx.dtype, copy=False))
        return dx

# ----------------------------
# [Problema 5] AveragePool2D (opcional avanzado)
# ----------------------------
class AveragePool2D:
    """2-D average pooling over NCHW tensors (no padding).

    ``kernel_size`` and ``stride`` accept an integer or a ``(height, width)``
    pair; ``stride`` defaults to the kernel size.
    """

    def __init__(self, kernel_size=2, stride=None):
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        self.kh, self.kw = kernel_size

        if stride is None:
            stride = (self.kh, self.kw)
        elif isinstance(stride, int):
            stride = (stride, stride)
        self.stride_h, self.stride_w = stride

        # Filled by forward(): a 1-tuple holding the input shape.
        self.cache = None

    def forward(self, x):
        """Average every pooling window of ``x`` (shape ``(N, C, H, W)``)."""
        N, C, H, W = x.shape
        out_h, out_w = conv2d_output_size(H, W, self.kh, self.kw, 0, 0, self.stride_h, self.stride_w)
        y = np.zeros((N, C, out_h, out_w), dtype=x.dtype)
        # ndindex walks (n, c, i, j) in the same row-major order as nested loops.
        for n, c, i, j in np.ndindex(N, C, out_h, out_w):
            top = i * self.stride_h
            left = j * self.stride_w
            y[n, c, i, j] = np.mean(x[n, c, top:top + self.kh, left:left + self.kw])
        self.cache = (x.shape,)
        return y

    def backward(self, d_out):
        """Spread each output gradient evenly over its window.

        Returns a float32 array with the shape of the forward input.
        """
        x_shape = self.cache[0]
        N, C, _, _ = x_shape
        dx = np.zeros(x_shape, dtype=np.float32)
        grid_h, grid_w = d_out.shape[2], d_out.shape[3]
        window_area = self.kh * self.kw
        for n, c, i, j in np.ndindex(N, C, grid_h, grid_w):
            top = i * self.stride_h
            left = j * self.stride_w
            # Every element of the window receives an equal share.
            dx[n, c, top:top + self.kh, left:left + self.kw] += d_out[n, c, i, j] / window_area
        return dx

# ----------------------------
# [Problema 6] Flatten
# ----------------------------
class Flatten:
    """Collapse every axis after the first into one, and undo it for gradients."""

    def __init__(self):
        # Shape of the most recent forward() input; None until forward() runs.
        self.orig_shape = None

    def forward(self, x):
        """Reshape ``x`` from ``(N, ...)`` to ``(N, D)`` and remember its shape."""
        self.orig_shape = x.shape
        return x.reshape((x.shape[0], -1))

    def backward(self, d_out):
        """Reshape ``d_out`` back to the shape seen by the last ``forward``."""
        return d_out.reshape(self.orig_shape)

# ----------------------------
# Capas Fully connected y Activaciones (utilidades)
# ----------------------------
class Linear:
    """Fully connected layer computing ``x.dot(W) + b``.

    ``W`` has shape ``(in_features, out_features)`` and is drawn from a normal
    distribution scaled by ``sqrt(2 / in_features)``; ``b`` starts at zero.
    ``dW`` and ``db`` stay ``None`` until ``backward`` has been called.
    """

    def __init__(self, in_features, out_features):
        he_scale = np.sqrt(2.0 / in_features)
        self.W = np.random.randn(in_features, out_features) * he_scale
        self.b = np.zeros((out_features,), dtype=np.float32)
        self.cache = None
        self.dW = None
        self.db = None

    def forward(self, x):
        """Apply the affine map to ``x`` of shape ``(N, in_features)``."""
        self.cache = x  # kept for the weight gradient in backward()
        return np.dot(x, self.W) + self.b

    def backward(self, d_out):
        """Store ``dW`` / ``db`` and return the gradient w.r.t. the input."""
        inputs = self.cache  # (N, in_features)
        self.dW = np.dot(inputs.T, d_out)  # (in_features, out_features)
        self.db = d_out.sum(axis=0)
        return np.dot(d_out, self.W.T)

    def step(self, lr=1e-3):
        """Apply one in-place SGD update with the stored gradients."""
        for param, grad in ((self.W, self.dW), (self.b, self.db)):
            param -= lr * grad

def relu(x):
    """Return the element-wise maximum of ``x`` and zero."""
    rectified = np.maximum(0, x)
    return rectified

def relu_backward(x, d_out):
    """Return a copy of ``d_out`` with entries zeroed wherever ``x <= 0``.

    ``x`` is the input that was fed to ``relu``; ``d_out`` is left untouched.
    """
    blocked = np.less_equal(x, 0)
    grad = d_out.copy()
    grad[blocked] = 0
    return grad

def softmax(x):
    """Row-wise softmax of ``x`` with shape ``(N, C)``.

    The row maximum is subtracted before exponentiating so large logits do
    not overflow.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    weights = np.exp(x - row_max)
    normaliser = np.sum(weights, axis=1, keepdims=True)
    return weights / normaliser

def cross_entropy_loss(probs, labels_onehot):
    """Mean cross-entropy between ``probs`` and one-hot labels, both ``(N, C)``.

    Probabilities are clipped to ``[1e-12, 1.0]`` before the logarithm so a
    zero probability yields a large finite loss instead of infinity.
    """
    batch = probs.shape[0]
    log_probs = np.log(np.clip(probs, 1e-12, 1.0))
    return -np.sum(labels_onehot * log_probs) / batch

def softmax_cross_entropy_backward(probs, labels_onehot):
    """Gradient of the mean cross-entropy loss w.r.t. the pre-softmax logits.

    For softmax followed by cross-entropy this is ``(probs - labels) / N``.
    """
    batch = probs.shape[0]
    residual = probs - labels_onehot
    return residual / batch

# ----------------------------
# [Problema 8] ScratchLeNetClassifier (versión moderna con ReLU y MaxPool)
# Arquitectura:
# Conv(1->6,5x5,stride1) -> ReLU -> MaxPool(2)
# Conv(6->16,5x5,stride1) -> ReLU -> MaxPool(2)
# Flatten
# FC 120 -> ReLU
# FC 84 -> ReLU
# FC 10 -> Softmax
# ----------------------------
class ScratchLeNetClassifier:
    """LeNet-style CNN classifier built from the NumPy layers in this file.

    Pipeline: Conv(1->6, 5x5) -> ReLU -> MaxPool(2) -> Conv(6->16, 5x5) -> ReLU
    -> MaxPool(2) -> Flatten -> FC 120 -> ReLU -> FC 84 -> ReLU -> FC 10 ->
    softmax.  The first fully connected layer is sized for 1x28x28 inputs.

    ``lr`` is the learning rate passed to every layer's ``step``.
    """

    def __init__(self, lr=1e-2):
        # Layers are created in network order; each Conv2d/Linear draws its
        # initial weights from NumPy's global random state when constructed.
        self.conv1 = Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=0)
        self.pool1 = MaxPool2D(kernel_size=2, stride=2)
        self.conv2 = Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1, padding=0)
        self.pool2 = MaxPool2D(kernel_size=2, stride=2)
        self.flatten = Flatten()
        # For 28x28 inputs: conv1 -> 24, pool -> 12, conv2 -> 8, pool -> 4,
        # leaving 16 channels of 4x4 = 256 features.
        flat_features = 16 * 4 * 4
        self.fc1 = Linear(flat_features, 120)
        self.fc2 = Linear(120, 84)
        self.fc3 = Linear(84, 10)
        self.lr = lr
        # Activations of the latest forward pass, needed by backward().
        self.caches = {}

    def forward(self, x):
        """Run the network on ``x`` of shape ``(N, 1, 28, 28)``.

        Returns the class probabilities of shape ``(N, 10)`` and stores every
        intermediate activation in ``self.caches``.
        """
        acts = {}
        acts['a1'] = self.conv1.forward(x)
        acts['r1'] = relu(acts['a1'])
        acts['p1'] = self.pool1.forward(acts['r1'])
        acts['a2'] = self.conv2.forward(acts['p1'])
        acts['r2'] = relu(acts['a2'])
        acts['p2'] = self.pool2.forward(acts['r2'])
        acts['flat'] = self.flatten.forward(acts['p2'])
        acts['z1'] = self.fc1.forward(acts['flat'])
        acts['r3'] = relu(acts['z1'])
        acts['z2'] = self.fc2.forward(acts['r3'])
        acts['r4'] = relu(acts['z2'])
        acts['logits'] = self.fc3.forward(acts['r4'])
        acts['probs'] = softmax(acts['logits'])
        self.caches = acts
        return acts['probs']

    def backward(self, labels_onehot):
        """Back-propagate from the last forward pass and update all weights.

        The gradient is pushed through the layers in reverse order; afterwards
        every trainable layer takes one SGD step with ``self.lr``.  Returns
        ``None``.
        """
        pre_act = self.caches
        grad = softmax_cross_entropy_backward(pre_act['probs'], labels_onehot)  # (N, 10)

        # Classifier head.
        grad = self.fc3.backward(grad)  # (N, 84)
        grad = relu_backward(pre_act['z2'], grad)
        grad = self.fc2.backward(grad)
        grad = relu_backward(pre_act['z1'], grad)
        grad = self.fc1.backward(grad)

        # Convolutional feature extractor.
        grad = self.flatten.backward(grad)
        grad = self.pool2.backward(grad)
        grad = relu_backward(pre_act['a2'], grad)
        grad = self.conv2.backward(grad)
        grad = self.pool1.backward(grad)
        grad = relu_backward(pre_act['a1'], grad)
        self.conv1.backward(grad)

        # Parameter update, in the same layer order as construction.
        for layer in (self.conv1, self.conv2, self.fc1, self.fc2, self.fc3):
            layer.step(self.lr)
        return None

    def compute_loss(self, probs, labels_onehot):
        """Return the mean cross-entropy of ``probs`` against one-hot labels."""
        return cross_entropy_loss(probs, labels_onehot)

    def predict(self, x):
        """Return the most probable class index for each sample in ``x``."""
        return np.argmax(self.forward(x), axis=1)

# ----------------------------
# [Problema 2] Test de forward/backward en matriz pequeña (verificación)
# Aquí replicamos el ejemplo dado y comprobamos resultados numéricos.
# Entrada x (1,1,4,4) y filtros w (2,3,3) con los valores del enunciado.
# ----------------------------
def problem2_check():
    """Run Conv2d forward/backward on the small worked example and print both.

    A 1x1x4x4 input holding 1..16 is convolved with two fixed 3x3 filters
    (stride 1, no padding), then a fixed upstream gradient is propagated
    back.  Returns ``(y, dx)``.
    """
    # Input (1, 1, 4, 4): the integers 1..16 in row-major order.
    x = np.arange(1, 17, dtype=np.float32).reshape(1, 1, 4, 4)

    # The exercise gives the filters as (2, 3, 3); Conv2d expects
    # (out_channels, in_channels, kh, kw), i.e. (2, 1, 3, 3).
    w = np.zeros((2, 1, 3, 3), dtype=np.float32)
    w[0, 0, 1, 1], w[0, 0, 2, 1] = 1.0, -1.0   # centre minus the pixel below
    w[1, 0, 1, 1], w[1, 0, 1, 2] = -1.0, 1.0   # right neighbour minus centre

    conv = Conv2d(in_channels=1, out_channels=2, kernel_size=3, stride=1, padding=0)
    conv.W = w
    conv.b = np.zeros((2,), dtype=np.float32)

    # Forward pass. Expected output:
    #   [[[-4, -4], [-4, -4]],
    #    [[ 1,  1], [ 1,  1]]]
    y = conv.forward(x)
    print("Problema 2 - Forward output:\n", y.astype(int))

    # Upstream gradient, already shaped (N, out_channels, out_h, out_w).
    delta = np.array([[[[-4, -4],
                        [10, 11]],
                       [[1, -7],
                        [1, -11]]]], dtype=np.float32)
    dx = conv.backward(delta)
    print("Problema 2 - Backward dx (padded removed):\n", dx[0,0].astype(int))
    # The exercise statement lists [[-5, 4], [13, 27]] as the expected
    # backward result; dx printed here is the full gradient with the shape of
    # the 4x4 input.
    return y, dx

# Run the Problem 2 check at import/execution time and keep its results
# (forward output and input gradient) in module-level names for inspection.
print("=== Ejecutando verificación Problema 2 ===")
y2, dx2 = problem2_check()
print("=== Fin verificación Problema 2 ===\n\n")

# ----------------------------
# [Problema 7 & 8] Entrenamiento (MNIST) - entrenamos una pasada pequeña para demostrar funcionamiento.
# NOTA: para ahorrar tiempo en Colab, usamos subset (por ejemplo N=2000) y pocas épocas.
# ----------------------------
def load_mnist_nchw(subset=2000):
    """Load MNIST and return it in NCHW layout with pixels scaled to [0, 1].

    Args:
        subset: number of leading training samples to keep (used as a slice
            bound, so ``None`` keeps the whole training set).  The test set is
            always returned in full.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` where the images are float32
        arrays of shape ``(N, 1, 28, 28)`` and the labels are integer arrays.
    """
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    # Slice before converting so only the kept training images are copied to
    # float32, instead of normalising the full set and discarding most of it.
    x_train = x_train[:subset].astype(np.float32) / 255.0
    y_train = y_train[:subset].astype(int)
    x_test = x_test.astype(np.float32) / 255.0
    y_test = y_test.astype(int)
    # Insert the channel axis: (N, 28, 28) -> (N, 1, 28, 28). Indexing with
    # newaxis also works when the subset is empty.
    x_train = x_train[:, np.newaxis, :, :]
    x_test = x_test[:, np.newaxis, :, :]
    return x_train, y_train, x_test, y_test

# Small trainer
def train_small(model, x_train, y_train, epochs=1, batch_size=64):
    """Train ``model`` with mini-batch SGD and print per-epoch statistics.

    Args:
        model: object exposing ``forward(x)``, ``compute_loss(probs, labels)``
            and ``backward(labels)``; ``backward`` is expected to apply the
            weight update itself.
        x_train: training inputs, indexed along the first axis.
        y_train: integer class labels (10 classes) aligned with ``x_train``.
        epochs: number of passes over the data.
        batch_size: samples per mini-batch; the last batch may be smaller.

    Raises:
        ValueError: if ``x_train`` contains no samples.
    """
    N = x_train.shape[0]
    if N == 0:
        raise ValueError("x_train is empty; nothing to train on")

    # The shuffle is tracked as an index array rather than by re-copying the
    # whole dataset every epoch. Composing the permutations keeps the batch
    # order identical to shuffling the already-shuffled arrays in place.
    order = np.arange(N)
    for epoch in range(epochs):
        t0 = time()
        order = order[np.random.permutation(N)]
        total_loss = 0.0
        total_correct = 0
        for start in range(0, N, batch_size):
            batch_idx = order[start:start + batch_size]
            xb = x_train[batch_idx]
            yb = y_train[batch_idx]
            labels_onehot = to_categorical(yb, num_classes=10)
            probs = model.forward(xb)
            loss = model.compute_loss(probs, labels_onehot)
            # Weight by batch size so the epoch average is per sample.
            total_loss += loss * xb.shape[0]
            preds = np.argmax(probs, axis=1)
            total_correct += np.sum(preds == yb)
            model.backward(labels_onehot)
        avg_loss = total_loss / N
        acc = total_correct / N
        print(f"Epoch {epoch+1}/{epochs} - loss: {avg_loss:.4f} - acc: {acc*100:.2f}% - time: {time()-t0:.1f}s")

# Load the first 2000 MNIST training images (plus the full test set) and
# train the LeNet-style model for one epoch with batches of 128.
print("=== Cargando MNIST y entrenando LeNet pequeño (subset) ===")
x_train, y_train, x_test, y_test = load_mnist_nchw(subset=2000)
model = ScratchLeNetClassifier(lr=1e-2)
train_small(model, x_train, y_train, epochs=1, batch_size=128)
# Quick evaluation on the first 500 test samples only, to keep the run short.
x_test_small = x_test[:500]
y_test_small = y_test[:500]
preds = model.predict(x_test_small)
# Fraction of predictions that match the labels.
acc_test = np.mean(preds == y_test_small)
print(f"Test accuracy (first 500 samples): {acc_test*100:.2f}%")
print("=== Fin entrenamiento y evaluación rápida ===\n\n")

# ----------------------------
# [Problema 10] Cálculo del tamaño de salida y número de parámetros
# Tenemos 3 casos, calculamos y mostramos.
# Fórmula para parámetros de una conv2d:
# params = (in_channels * filter_h * filter_w) * out_channels + out_channels (biases)
# ----------------------------
def problem10_calculations():
    """Compute output size and parameter count for the three Problem 10 cases.

    Returns a list of ``(case, out_h, out_w, params)`` tuples, where ``case``
    is the dictionary describing the layer and ``params`` counts the weights
    plus one bias per output channel.
    """
    cases = [
        {'in_size': (144, 144), 'in_ch': 3, 'filter': (3, 3), 'out_ch': 6, 'stride': 1, 'pad': 0},
        {'in_size': (60, 60), 'in_ch': 24, 'filter': (3, 3), 'out_ch': 48, 'stride': 1, 'pad': 0},
        {'in_size': (20, 20), 'in_ch': 10, 'filter': (3, 3), 'out_ch': 20, 'stride': 2, 'pad': 0},
    ]

    def _evaluate(case):
        # The same stride and padding are used on both axes.
        in_h, in_w = case['in_size']
        f_h, f_w = case['filter']
        stride = case['stride']
        pad = case['pad']
        # Leftover pixels are dropped by the floor division in the helper.
        out_h, out_w = conv2d_output_size(in_h, in_w, f_h, f_w, pad, pad, stride, stride)
        # One filter of in_ch * f_h * f_w weights plus one bias per output channel.
        n_params = case['out_ch'] * (case['in_ch'] * f_h * f_w + 1)
        return case, out_h, out_w, n_params

    return [_evaluate(case) for case in cases]

# Compute the Problem 10 results once and print one two-line report per case:
# the layer description followed by its output size and parameter count.
res10 = problem10_calculations()
print("=== Problema 10: Tamaño de salida y número de parámetros ===")
for case, oh, ow, params in res10:
    Hin, Win = case['in_size']
    print(f"Input: {Hin}x{Win}, in_ch={case['in_ch']}, filter={case['filter']}, out_ch={case['out_ch']}, stride={case['stride']}")
    print(f" -> Output size: {oh} x {ow}; Num params (incl. bias): {params}")
print("=== Fin Problema 10 ===\n\n")

# ----------------------------
# [Problema 9] (Research) - SUMMARY
# (A short summary is given further down, in PROBLEM_9_10_11_TEXT.)
# ----------------------------

# ----------------------------
# [Problema 11] Investigación sobre tamaño de filtro - respuestas dentro de comentarios
# Se proveerán las respuestas teóricas en bloque multilinea debajo (como exige el enunciado).
# ----------------------------

# ----------------------------
# [Problems 9, 10, 11] - Theoretical answers, stored as a multi-line string
# (the text itself is in Spanish and is not printed by this script).
# ----------------------------
PROBLEM_9_10_11_TEXT = """
[Problema 9] Resumen rápido (ver referencias externas):
- AlexNet (Krizhevsky et al., 2012): introdujo un gran CNN entrenado en ImageNet con ReLUs, dropout, y entrenamiento en GPU multi-GPU. Aceleró la adopción de redes profundas en visión por su gran mejora en ILSVRC-2012.
- VGG16 (Simonyan & Zisserman, 2014): mostró que apilar muchas capas de convolución 3x3 (profundidad 16-19) mejora la precisión; diseño muy simple y repetitivo (bloques de conv3x3 + maxpool), usado ampliamente como extractor de características preentrenado.

[Problema 10] (Cálculos realizados por el código):
Resultados mostrados en la salida del programa:
- Caso 1 -> Output size y número de parámetros (incl. bias).
- Caso 2 -> ...
- Caso 3 -> ...
(Ver impresión en la ejecución).

[Problema 11] Sobre tamaños de filtro:
- Por qué 3x3 en lugar de 7x7:
  * Las pilas de 2 o 3 filtros 3x3 consecutivos tienen la misma recepción de campo que un filtro mayor (por ejemplo, tres 3x3 -> RF 7x7) pero con menos parámetros y más no linealidad (más ReLU), lo cual mejora la capacidad de representación y reduce el número de parámetros.
  * 3x3 es un buen equilibrio entre captar contexto local y mantener eficiencia computacional y de memoria.
- Efecto de un filtro 1x1:
  * Un 1x1 actúa como combinación lineal across-channels (mezcla de canales) sin afectar la dimensión espacial.
  * Es muy útil para cambiar el número de canales (dimension-reduction/expansion), añadir no-linealidad entre convoluciones espaciales, y como "bottleneck" para reducir parámetros (ej.: ResNet/Bottleneck).
"""
# (the text above is kept as documentation; it is not printed again here)

# Keep a second module-level reference to the text for later inspection.
__PROBLEM_SUMMARY = PROBLEM_9_10_11_TEXT

# ----------------------------
# FIN DEL SCRIPT
# ----------------------------
# El script implementa Conv2d, MaxPool2D, AveragePool2D, Flatten, la arquitectura LeNet (ScratchLeNetClassifier),
# y resuelve los problemas de verificación (Problema 2), cálculo de tamaños/parametros (Problema 10),
# y proporciona respuestas teóricas (Problema 9 y 11) dentro de la variable PROBLEM_9_10_11_TEXT.
#
# Para ejecutar todo en Colab: copia y pega este archivo en una celda y ejecútalo.
# Puedes ajustar subset en load_mnist_nchw(subset=...) y epochs en train_small(...) para experimentar más.


=== Ejecutando verificación Problema 2 ===
Problema 2 - Forward output:
 [[[[-4 -4]
   [-4 -4]]

  [[ 1  1]
   [ 1  1]]]]
Problema 2 - Backward dx (padded removed):
 [[  0   0   0   0]
 [  0  -5   4  -7]
 [  0  13  27 -11]
 [  0 -10 -11   0]]
=== Fin verificación Problema 2 ===


=== Cargando MNIST y entrenando LeNet pequeño (subset) ===
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Epoch 1/1 - loss: 2.3747 - acc: 12.50% - time: 229.3s
Test accuracy (first 500 samples): 16.60%
=== Fin entrenamiento y evaluación rápida ===


=== Problema 10: Tamaño de salida y número de parámetros ===
Input: 144x144, in_ch=3, filter=(3, 3), out_ch=6, stride=1
 -> Output size: 142 x 142; Num params (incl. bias): 168
Input: 60x60, in_ch=24, filter=(3, 3), out_ch=48, stride=1
 -> Output size: 58 x 58; Num params (incl. bias): 10416
Input: 20x20, in_ch=10, filter=(3, 3), out_ch=