# LN 层归一化（Layer Normalization）

In [None]:
import torch
from torch import nn

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension of the input.

    Each vector along the last axis is shifted to zero mean and scaled to
    unit (biased) variance, then transformed by a learnable per-feature
    scale ``gamma`` and shift ``beta``.

    Args:
        hidden_size: Length of the last dimension; size of ``gamma``/``beta``.
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, hidden_size, eps = 1e-6):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps

        # Learnable affine parameters: identity transform at initialization.
        self.gamma = nn.Parameter(torch.ones(hidden_size))
        self.beta = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x):
        """Return ``x`` normalized along its last dimension (same shape)."""
        mu = torch.mean(x, dim=-1, keepdim=True)
        # Biased (population) variance, as is conventional for layer norm.
        var = torch.var(x, dim=-1, keepdim=True, unbiased=False)

        centered = x - mu
        std = torch.sqrt(var + self.eps)

        return self.gamma * (centered / std) + self.beta

def test_layer_norm():
    """Smoke-test LayerNorm on a random tensor and print the shapes."""
    dims = (2, 4, 8)  # (batch, sequence, hidden)

    sample = torch.randn(*dims)
    norm = LayerNorm(dims[-1])
    normalized = norm(sample)

    for label, tensor in (("Input shape:", sample), ("Output shape:", normalized)):
        print(label, tensor.shape)


# Run the smoke test only when executed as a script / notebook cell.
if __name__ == "__main__":
    test_layer_norm()

# RMSNorm 均方根归一化（Root Mean Square Normalization）

In [None]:
import torch
from torch import nn

class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension.

    Each vector along the last axis is divided by its root mean square
    (no mean subtraction), then scaled by a learnable per-feature ``gamma``.

    Args:
        hidden_size: Length of the last dimension; size of ``gamma``.
        eps: Constant added to the mean square before the square root.
    """

    def __init__(self, hidden_size, eps = 1e-6):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps

        # Learnable scale, initialized to the identity.
        self.gamma = nn.Parameter(torch.ones(hidden_size))

    def forward(self, x):
        """Return ``x`` divided by its RMS along the last dimension, scaled by gamma."""
        rms = torch.sqrt(torch.mean(x.pow(2), dim = -1, keepdim = True) + self.eps)

        # True division is required here: floor division (`//`) would throw
        # away the fractional part and collapse the output to integers.
        x_normalized = x / rms

        return self.gamma * x_normalized
    
def test_rms_norm():
    """Smoke-test RMSNorm on a random tensor; print shapes and parameters."""
    dims = (2, 4, 8)  # (batch, sequence, hidden)

    sample = torch.randn(*dims)
    norm = RMSNorm(dims[-1])
    normalized = norm(sample)

    print("Input shape:", sample.shape)
    print("Output shape:", normalized.shape)
    print("Parameters:", list(norm.parameters()))


# Run the smoke test only when executed as a script / notebook cell.
if __name__ == "__main__":
    test_rms_norm()



# BN 批次归一化 (Batch Normalization)

In [None]:
import torch
from torch import nn

class BatchNorm(nn.Module):
    """Batch normalization over every dimension except the last (feature) one.

    In training mode the per-feature mean and biased variance of the current
    batch are used for normalization, and exponential moving averages of the
    mean and the unbiased variance are tracked. In evaluation mode the
    tracked running statistics are used instead.

    Args:
        hidden_size: Size of the last (feature) dimension.
        eps: Constant added to the variance before the square root.
        momentum: Weight of the current batch in the running-statistics update.
    """

    def __init__(self, hidden_size, eps = 1e-6, momentum = 0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        self.momentum = momentum

        # Learnable affine parameters. The attribute name `betta` (sic) is kept
        # so existing attribute access and state_dict keys continue to work.
        self.gamma = nn.Parameter(torch.ones(hidden_size))
        self.betta = nn.Parameter(torch.zeros(hidden_size))

        # Buffers (not parameters): saved in state_dict and moved by .to(),
        # but never touched by the optimizer.
        self.register_buffer("running_mean", torch.zeros(hidden_size))
        self.register_buffer("running_var", torch.ones(hidden_size))

    def forward(self, x):
        """Normalize ``x`` per feature; the output has the same shape as ``x``.

        Raises:
            ValueError: If ``x`` has fewer than two dimensions.
        """
        if x.dim() < 2:
            raise ValueError(
                f"BatchNorm expects input with at least 2 dimensions, got {x.dim()}"
            )

        # `self.training` is the mode flag; `self.train` is a bound method and
        # would always be truthy.
        if self.training:
            # Reduce over everything except the feature axis so that the
            # statistics keep the shape (hidden_size,).
            reduce_dims = tuple(range(x.dim() - 1))
            mean = x.mean(dim = reduce_dims)
            variance = x.var(dim = reduce_dims, unbiased = False)

            # The running statistics are bookkeeping only: update them in
            # place and keep them out of the autograd graph.
            with torch.no_grad():
                count = x.numel() // x.shape[-1]
                # Bessel's correction for the tracked variance; undefined for
                # a single sample, so fall back to the biased estimate.
                if count > 1:
                    tracked_var = variance * (count / (count - 1))
                else:
                    tracked_var = variance
                self.running_mean.mul_(1 - self.momentum).add_(self.momentum * mean)
                self.running_var.mul_(1 - self.momentum).add_(self.momentum * tracked_var)
        else:
            mean = self.running_mean
            variance = self.running_var

        # True division: floor division (`//`) would destroy the result.
        x_normalized = (x - mean) / torch.sqrt(variance + self.eps)

        return self.gamma * x_normalized + self.betta

def test_batch_norm():
    """Smoke-test BatchNorm on a random tensor and print the shapes."""
    dims = (2, 4, 8)  # (batch, sequence, hidden)

    sample = torch.randn(*dims)
    norm = BatchNorm(dims[-1])
    normalized = norm(sample)

    for label, tensor in (("Input shape:", sample), ("Output shape:", normalized)):
        print(label, tensor.shape)


# Run the smoke test only when executed as a script / notebook cell.
if __name__ == "__main__":
    test_batch_norm()

# Dropout

In [None]:
import torch
from torch import nn

class Dropout(nn.Module):
    """Inverted dropout.

    In training mode each element is zeroed independently with probability
    ``dropout_prob`` and the survivors are scaled by ``1 / (1 - dropout_prob)``
    so the expected value is unchanged. In evaluation mode the input is
    returned as is.

    Args:
        dropout_prob: Probability of zeroing an element, in ``[0, 1]``.

    Raises:
        ValueError: If ``dropout_prob`` is outside ``[0, 1]``.
    """

    def __init__(self, dropout_prob = 0.1):
        super().__init__()
        if not 0.0 <= dropout_prob <= 1.0:
            raise ValueError(
                f"dropout_prob must be between 0 and 1, got {dropout_prob}"
            )
        self.dropout_prob = dropout_prob

    def forward(self, x):
        """Apply dropout to ``x`` (training) or return it unchanged (eval)."""
        # `self.training` is the mode flag; `self.train` is a bound method and
        # would always be truthy, applying dropout even after .eval().
        if not self.training or self.dropout_prob == 0.0:
            return x
        if self.dropout_prob == 1.0:
            # Everything is dropped; avoid the division by zero below.
            return torch.zeros_like(x)

        keep_prob = 1.0 - self.dropout_prob
        # Uniform samples in [0, 1) give an exact keep probability; rand_like
        # also matches the device and dtype of the input.
        mask = (torch.rand_like(x) >= self.dropout_prob).to(x.dtype)

        return mask * x / keep_prob

def test_dropout():
    """Smoke-test Dropout in training and evaluation mode; print the shapes."""
    dims = (2, 4, 8)  # (batch, sequence, hidden)

    sample = torch.randn(*dims)
    layer = Dropout(dropout_prob=0.1)

    # Training mode first, then evaluation mode, on the same input.
    layer.train()
    train_out = layer(sample)

    layer.eval()
    eval_out = layer(sample)

    print("Input shape:", sample.shape)
    print("Output shape during training:", train_out.shape)
    print("Output shape during evaluation:", eval_out.shape)


# Run the smoke test only when executed as a script / notebook cell.
if __name__ == "__main__":
    test_dropout()

# Backpropagation 反向传播梯度下降

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the model, optimizer, loss criterion and data loader.
# NOTE(review): `MyModel` and `MyDataloader` are not defined in the visible
# source; they appear to be placeholders for a user-supplied nn.Module and an
# iterable of (inputs, labels) batches.
model = MyModel()
optimizer = optim.SGD(model.parameters(), lr = 0.1)
criterion = nn.CrossEntropyLoss()
dataloader = MyDataloader()
num_epochs = 10  # placeholder value; adjust for the task at hand

# Switch the model to training mode.
model.train()
for epoch in range(num_epochs):
    for inputs, labels in dataloader:
        # Forward pass and loss.
        preds = model(inputs)
        loss = criterion(preds, labels)

        # Backward pass, parameter update, then clear the gradients so they
        # do not accumulate into the next iteration.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        


### 反向传播 (BP) 算法核心公式详解

#### 1. **前向传播公式**
| 层类型 | 公式 | 说明 |
|--------|------|------|
| **输入层** | $a^{(0)} = x$ | $x$ 为输入数据 |
| **隐藏层** | $z^{(l)} = W^{(l)}a^{(l-1)} + b^{(l)}$ | 线性变换 |
| | $a^{(l)} = f(z^{(l)})$ | 激活函数 $f$ (如 Sigmoid, ReLU) |
| **输出层** | $\hat{y} = a^{(L)}$ | 最终预测值 |

#### 2. **损失函数 (均方误差 MSE)**
$$ \mathcal{L} = \frac{1}{2N} \sum_{i=1}^N (\hat{y}_i - y_i)^2 $$
- $N$: 样本数量
- $\hat{y}$: 预测值
- $y$: 真实值

#### 3. **反向传播核心公式 (链式法则)**
##### 3.1 输出层梯度
$$ \delta^{(L)} = \frac{\partial \mathcal{L}}{\partial z^{(L)}} = \frac{\partial \mathcal{L}}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial z^{(L)}} = (\hat{y} - y) \odot f'(z^{(L)}) $$
- $\odot$: 逐元素乘法
- $f'$: 激活函数的导数

##### 3.2 隐藏层梯度 (递归计算)
$$ \delta^{(l)} = \frac{\partial \mathcal{L}}{\partial z^{(l)}} = \left( (W^{(l+1)})^T \delta^{(l+1)} \right) \odot f'(z^{(l)}) $$

##### 3.3 参数梯度计算
| 参数 | 梯度公式 | 维度说明 |
|------|----------|---------|
| **权重梯度** | $\frac{\partial \mathcal{L}}{\partial W^{(l)}} = \delta^{(l)} (a^{(l-1)})^T$ | $d^{(l)} \times d^{(l-1)}$ |
| **偏置梯度** | $\frac{\partial \mathcal{L}}{\partial b^{(l)}} = \delta^{(l)}$ | $d^{(l)} \times 1$ |

#### 4. **激活函数导数**
| 激活函数 | $f(z)$ | $f'(z)$ | 特点 |
|----------|--------|---------|------|
| **Sigmoid** | $\frac{1}{1+e^{-z}}$ | $f(z)(1-f(z))$ | 易梯度消失 |
| **ReLU** | $\max(0,z)$ | $\begin{cases}1 & z>0\\0 & z\leq0\end{cases}$ | 缓解梯度消失 |
| **Tanh** | $\frac{e^z-e^{-z}}{e^z+e^{-z}}$ | $1-f(z)^2$ | 中心化输出 |

#### 5. **参数更新规则**
$$ \begin{aligned}
W^{(l)} &\leftarrow W^{(l)} - \eta \frac{\partial \mathcal{L}}{\partial W^{(l)}} \\
b^{(l)} &\leftarrow b^{(l)} - \eta \frac{\partial \mathcal{L}}{\partial b^{(l)}}
\end{aligned} $$
- $\eta$: 学习率 (learning rate)

#### 6. **批量梯度计算**
$$ \nabla_W \mathcal{L} = \frac{1}{N} \sum_{i=1}^N \nabla_W \mathcal{L}_i $$
- 对 $N$ 个样本的梯度求平均

### 公式推导详解 (以Sigmoid激活为例)

#### 1. 输出层梯度推导：
$$
\begin{aligned}
\delta^{(L)} &= \frac{\partial \mathcal{L}}{\partial z^{(L)}} \\
&= \frac{\partial \mathcal{L}}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial z^{(L)}} \\
&= (\hat{y} - y) \cdot \underbrace{\frac{\partial \sigma(z^{(L)})}{\partial z^{(L)}}}_{\text{Sigmoid导数}} \\
&= (\hat{y} - y) \odot \sigma(z^{(L)}) \odot (1 - \sigma(z^{(L)}))
\end{aligned}
$$

#### 2. 隐藏层梯度推导：
$$
\begin{aligned}
\delta^{(l)} &= \frac{\partial \mathcal{L}}{\partial z^{(l)}} \\
&= \frac{\partial \mathcal{L}}{\partial z^{(l+1)}} \cdot \frac{\partial z^{(l+1)}}{\partial a^{(l)}} \cdot \frac{\partial a^{(l)}}{\partial z^{(l)}} \\
&= \delta^{(l+1)} \cdot \frac{\partial}{\partial a^{(l)}} \left( W^{(l+1)}a^{(l)} + b^{(l+1)} \right) \cdot f'(z^{(l)}) \\
&= \left( (W^{(l+1)})^T \delta^{(l+1)} \right) \odot f'(z^{(l)})
\end{aligned}
$$

### 反向传播算法流程总结

```mermaid
graph LR
A[前向传播] --> B[计算损失]
B --> C[计算输出层梯度 δ⁽ᴸ⁾]
C --> D[递归计算隐藏层梯度 δ⁽ˡ⁾]
D --> E[计算权重梯度 ∂L/∂W]
E --> F[计算偏置梯度 ∂L/∂b]
F --> G[更新参数]
G --> A
```

### 关键记忆点
1. **误差反向传播**：$\delta^{(l)} = (W^{(l+1)})^T \delta^{(l+1)} \odot f'(z^{(l)})$
2. **权重梯度**：$\nabla_{W^{(l)}} \mathcal{L} = \delta^{(l)} (a^{(l-1)})^T$
3. **链式法则**：核心数学原理
4. **激活函数导数**：决定梯度传播特性
5. **批量平均**：对多个样本梯度求平均

> BP算法通过高效计算梯度，使深度神经网络训练成为可能。理解这些核心公式是掌握深度学习优化的基础，尤其要注意激活函数导数的计算，它直接影响梯度传播的稳定性。

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict, Tuple, List

class NeuralNetwork:
    """Fully connected network trained with hand-written backpropagation.

    Hidden layers use a sigmoid activation and the output layer is linear.
    Arrays are laid out as (features, samples), so each column is one sample.
    """

    def __init__(self, layers_dim: List[int], learning_rate: float = 0.3):
        """Create the network and initialize its parameters.

        Args:
            layers_dim: Number of units per layer, e.g. ``[1, 25, 1]`` for one
                input unit, 25 hidden units and one output unit.
            learning_rate: Step size used by ``update_parameters``.
        """
        self.layers_dim = layers_dim
        self.learning_rate = learning_rate
        self.num_layers = len(layers_dim)
        self.parameters = self._init_parameters()
        self.loss_history = []

    def _init_parameters(self) -> Dict[str, np.ndarray]:
        """Return Xavier/Glorot-uniform weights ``w{i}`` and zero biases ``b{i}``."""
        params = {}
        fan_pairs = zip(self.layers_dim[:-1], self.layers_dim[1:])
        for layer, (fan_in, fan_out) in enumerate(fan_pairs, start=1):
            # Glorot uniform bound: sqrt(6 / (fan_in + fan_out)).
            bound = np.sqrt(6 / (fan_in + fan_out))
            params[f"w{layer}"] = np.random.uniform(-bound, bound, (fan_out, fan_in))
            params[f"b{layer}"] = np.zeros((fan_out, 1))
        return params

    @staticmethod
    def sigmoid(z: np.ndarray) -> np.ndarray:
        """Element-wise logistic sigmoid."""
        denominator = 1.0 + np.exp(-z)
        return 1.0 / denominator

    @staticmethod
    def sigmoid_prime(z: np.ndarray) -> np.ndarray:
        """Element-wise derivative of the sigmoid, ``s(z) * (1 - s(z))``."""
        activation = NeuralNetwork.sigmoid(z)
        return activation * (1 - activation)

    def forward(self, x: np.ndarray) -> Tuple[Dict[str, np.ndarray], np.ndarray]:
        """Run a forward pass.

        Returns:
            A tuple ``(caches, output)``. ``caches`` maps ``"z"`` and ``"a"``
            to lists of per-layer pre-activations and activations (index 0 is
            the input itself); ``output`` is the linear output layer's value.
        """
        # A 1-D input is treated as a single feature row: (1, samples).
        if x.ndim == 1:
            x = x.reshape(1, -1)

        pre_activations = [x]
        activations = [x]
        current = x
        last = self.num_layers - 1

        # Hidden layers: affine transform followed by sigmoid.
        for layer in range(1, last):
            weights = self.parameters[f"w{layer}"]
            bias = self.parameters[f"b{layer}"]
            pre = weights.dot(current) + bias
            current = self.sigmoid(pre)
            pre_activations.append(pre)
            activations.append(current)

        # Output layer: affine transform only, no activation.
        result = self.parameters[f"w{last}"].dot(current) + self.parameters[f"b{last}"]
        pre_activations.append(result)
        activations.append(result)

        return {"z": pre_activations, "a": activations}, result

    def backward(self, caches: Dict[str, np.ndarray], output: np.ndarray, y: np.ndarray) -> Dict[str, np.ndarray]:
        """Back-propagate the error and return the gradients.

        The output-layer error is ``output - y`` and all gradients are averaged
        over the samples, which for a single output unit is the gradient of
        half the mean squared error (``compute_loss`` reports the full MSE).

        Returns:
            Dict with ``dz{i}``, ``dw{i}`` and ``db{i}`` for every layer ``i``.
        """
        grads = {}
        n_samples = y.shape[1]
        activations = caches["a"]
        top = self.num_layers - 1

        def record(layer, delta):
            # Store the layer error and the sample-averaged parameter gradients.
            grads[f"dz{layer}"] = delta
            grads[f"dw{layer}"] = delta.dot(activations[layer - 1].T) / n_samples
            grads[f"db{layer}"] = np.sum(delta, axis=1, keepdims=True) / n_samples

        # Linear output layer: the error is the plain residual.
        delta = output - y
        record(top, delta)

        # Hidden layers, from the last one back to the first.
        for layer in reversed(range(1, top)):
            upstream = self.parameters[f"w{layer + 1}"].T.dot(delta)
            delta = upstream * self.sigmoid_prime(caches["z"][layer])
            record(layer, delta)

        return grads

    def update_parameters(self, gradients: Dict[str, np.ndarray]):
        """Apply one in-place gradient-descent step to all weights and biases."""
        for layer in range(1, self.num_layers):
            for kind in ("w", "b"):
                self.parameters[f"{kind}{layer}"] -= self.learning_rate * gradients[f"d{kind}{layer}"]

    @staticmethod
    def compute_loss(output: np.ndarray, y: np.ndarray) -> float:
        """Return the mean squared error between ``output`` and ``y``."""
        residual = output - y
        return np.mean(np.square(residual))

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Return the network output for ``x``."""
        return self.forward(x)[1]

    def train(self, x: np.ndarray, y: np.ndarray, epochs: int, log_interval: int = 100):
        """Train with full-batch gradient descent.

        Args:
            x: Inputs; a 1-D array is reshaped to (1, n).
            y: Targets; a 1-D array is reshaped to (1, n).
            epochs: Number of passes over the data.
            log_interval: The loss is printed on the first epoch and on every
                epoch that is a multiple of this value.
        """
        # Normalize 1-D arrays to the (1, n) row layout.
        if x.ndim == 1:
            x = x.reshape(1, -1)
        if y.ndim == 1:
            y = y.reshape(1, -1)

        self.loss_history = []

        for epoch in range(1, epochs + 1):
            caches, output = self.forward(x)

            # The loss is recorded before this epoch's parameter update.
            loss = self.compute_loss(output, y)
            self.loss_history.append(loss)

            self.update_parameters(self.backward(caches, output, y))

            if epoch % log_interval == 0 or epoch == 1:
                print(f"Epoch {epoch:4d}/{epochs} | Loss: {loss:.6f}")

    def plot_training_history(self):
        """Plot the recorded per-epoch loss curve."""
        plt.figure(figsize=(10, 6))
        plt.plot(self.loss_history)
        plt.title("Training Loss History")
        plt.xlabel("Epoch")
        plt.ylabel("Mean Squared Error")
        plt.grid(True)
        plt.show()


def generate_sine_data(num_samples: int = 100) -> Tuple[np.ndarray, np.ndarray]:
    """Generate one period of a sine wave with amplitude 20.

    Args:
        num_samples: Number of evenly spaced inputs taken from [0, 1].

    Returns:
        A tuple ``(x, y)`` of arrays, both shaped (1, num_samples), where
        ``y = 20 * sin(2 * pi * x)``.
    """
    inputs = np.linspace(0, 1, num_samples).reshape(1, -1)
    targets = 20 * np.sin(2 * np.pi * inputs)
    return inputs, targets


def plot_results(x: np.ndarray, y_true: np.ndarray, y_pred: np.ndarray):
    """Plot the true values as a scatter and the predictions as a line.

    Args:
        x: Input values; flattened before plotting.
        y_true: Target values; flattened before plotting.
        y_pred: Predicted values; flattened before plotting.
    """
    plt.figure(figsize=(12, 7))

    xs = x.flatten()

    # Ground truth as blue points.
    plt.scatter(xs, y_true.flatten(),
                color='blue', label='True Values', alpha=0.6)

    # Model predictions as a red line.
    plt.plot(xs, y_pred.flatten(),
             color='red', linewidth=2, label='Predictions')

    plt.title("Neural Network Regression: True vs Predicted")
    plt.xlabel("Input")
    plt.ylabel("Output")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


def main():
    """Train a [1, 25, 1] network on sine data, then plot and report the fit."""
    # Configuration.
    layer_sizes = [1, 25, 1]  # 1 input unit, 25 hidden units, 1 output unit
    step_size = 0.3
    n_epochs = 4000
    report_every = 500

    x, y = generate_sine_data()

    print("Initializing neural network...")
    network = NeuralNetwork(layers_dim=layer_sizes, learning_rate=step_size)

    print(f"Training neural network for {n_epochs} epochs...")
    network.train(x, y, epochs=n_epochs, log_interval=report_every)

    print("Plotting training history...")
    network.plot_training_history()

    print("Generating predictions...")
    predictions = network.predict(x)

    print("Plotting results...")
    plot_results(x, y, predictions)

    # Loss of the trained network on the training data.
    final_loss = network.compute_loss(predictions, y)
    print(f"\nTraining completed. Final loss: {final_loss:.6f}")


# Run the demo only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()

# Gradient Accumulation

In [None]:
# Gradient accumulation: gradients from several consecutive batches are summed
# before a single optimizer step is taken.
# NOTE(review): `dataloader`, `model`, `criterion` and `optimizer` are not
# defined in this cell; presumably they come from an earlier cell — confirm.
gradient_accumulation_steps = 4

for batch_idx, (inputs, labels) in enumerate(dataloader):
    preds = model(inputs)
    loss = criterion(preds, labels)
    
    # Scale the loss so that the gradients summed over one accumulation window
    # correspond to the average over its batches rather than their sum.
    loss = loss / gradient_accumulation_steps
    # backward() adds to the existing .grad values; they are only cleared by
    # the zero_grad() call below.
    loss.backward()

    # Step once per full window, and also on the final batch so a trailing
    # partial window is not dropped.
    # NOTE(review): if len(dataloader) is not a multiple of
    # gradient_accumulation_steps, the final window holds fewer batches but is
    # still divided by the full step count, so that update is under-scaled.
    if (batch_idx + 1) % gradient_accumulation_steps == 0 or (batch_idx + 1) == len(dataloader):
        optimizer.step()
        optimizer.zero_grad()

# MLP 全连接神经网络

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
 
### Model definition
### Shapes: (N, 1) -> (N, n_hidden) -> (N, n_hidden) -> (N, 1)
class Net(nn.Module):
    """Multi-layer perceptron with two ReLU hidden layers and a linear output.

    Args:
        n_input: Number of input features.
        n_hidden: Width of both hidden layers.
        n_output: Number of output features.
    """

    def __init__(self, n_input, n_hidden, n_output):
        super().__init__()
        # Layers are created in forward order: input -> hidden -> hidden -> output.
        self.dense1 = nn.Linear(n_input, n_hidden)
        self.dense2 = nn.Linear(n_hidden, n_hidden)
        self.out = nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Return the network output for ``x``."""
        hidden = F.relu(self.dense1(x))
        hidden = F.relu(self.dense2(hidden))
        return self.out(hidden)
    
# Build the model: 1 input feature, 20 hidden units per hidden layer, 1 output.
model = Net(1, 20, 1)
print(model)

# Prepare the data: 100 points on [-1, 1] with targets y = x^3 plus Gaussian noise.
x = torch.unsqueeze(torch.linspace(-1, 1, 100), dim=1)
y = x.pow(3) + 0.1 * torch.randn(x.size())

# torch.autograd.Variable has been a deprecated no-op wrapper since PyTorch
# 0.4, so the tensors are used directly instead of being wrapped.

plt.scatter(x.numpy(), y.numpy())
plt.show()

# Training pipeline: full-batch SGD on the mean squared error.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_func = torch.nn.MSELoss()

for t in range(500):
    predict = model(x)
    loss = loss_func(predict, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Redraw the current fit every 5 iterations.
    if t % 5 == 0:
        plt.cla()
        plt.scatter(x.numpy(), y.numpy())
        # detach() drops the autograd graph so the prediction can be converted
        # to NumPy; item() extracts the scalar loss as a Python float.
        plt.plot(x.numpy(), predict.detach().numpy(), 'r-', lw=5)
        plt.text(0.5, 0, f'Loss = {loss.item():.4f}', fontdict={'size': 20, 'color': 'red'})
        plt.pause(0.05)