# 第七章 - 深度学习模型

## 7.1 CNN类深度网络

### 7.1.1 1D-CNN

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# 1D-CNN model definition
class Simple1DCNN(nn.Module):
    """Minimal 1D-CNN classifier: Conv1d -> ReLU -> MaxPool1d -> Linear -> ReLU -> Linear.

    Args:
        in_channels: Number of channels of the input sequence (default 1).
        sequence_length: Length of the input sequences; it sizes the first
            fully connected layer (default 100).
        num_classes: Number of output classes (default 10).
        conv_channels: Number of convolution output channels (default 16).

    Input shape is (batch_size, in_channels, sequence_length); output shape is
    (batch_size, num_classes).
    """
    def __init__(self, in_channels=1, sequence_length=100, num_classes=10, conv_channels=16):
        super(Simple1DCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=conv_channels,
                               kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        # The convolution keeps the length (kernel 3, padding 1); the pooling halves it.
        self.flat_features = conv_channels * (sequence_length // 2)
        self.fc1 = nn.Linear(self.flat_features, 64)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        # Flatten per sample. Unlike view(-1, N), this keeps the batch axis intact,
        # so an input of unexpected length fails in fc1 instead of silently being
        # folded into extra batch rows.
        x = x.flatten(start_dim=1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [3]:
# Generate test data
# The input is assumed to have shape (batch_size, channels, sequence_length)
batch_size = 2
channels = 1
sequence_length = 100
test_input = torch.randn(batch_size, channels, sequence_length)

In [4]:
# Create a model instance
model = Simple1DCNN()

# Run a forward pass through the model
output = model(test_input)

print("Output shape:", output.shape)
print("Output tensor:", output)

Output shape: torch.Size([2, 10])
Output tensor: tensor([[-0.3385,  0.0440, -0.1097, -0.0110, -0.1404, -0.3486, -0.1274, -0.2438,
          0.1753, -0.1129],
        [-0.2386,  0.0682, -0.3389, -0.0713, -0.1648, -0.3605, -0.1303,  0.0037,
          0.4261, -0.1400]], grad_fn=<AddmmBackward0>)


### 7.1.2 Wavenet

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [6]:
# A causal convolution layer for WaveNet
class CausalConv1d(nn.Module):
    """1D convolution whose output at time t only depends on inputs at times <= t.

    The wrapped ``nn.Conv1d`` pads both ends with ``(kernel_size - 1) * dilation``
    zeros; ``forward`` then drops the trailing outputs so the result has the same
    length as the input.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Size of the convolution kernel.
        dilation: Dilation factor (default 1).
    """
    def __init__(self, in_channels, out_channels, kernel_size, dilation=1):
        super(CausalConv1d, self).__init__()
        self.padding = (kernel_size - 1) * dilation
        self.conv1d = nn.Conv1d(in_channels, out_channels, kernel_size,
                                padding=self.padding, dilation=dilation)

    def forward(self, x):
        x = self.conv1d(x)
        if self.padding == 0:
            # kernel_size == 1: nothing was padded, and x[..., :-0] would be empty.
            return x
        return x[:, :, :-self.padding]  # drop the outputs produced by the right-hand padding

# WaveNet residual block
class ResidualBlock(nn.Module):
    """Gated residual block: dilated causal conv -> tanh * sigmoid gate -> 1x1 convs.

    The dilated convolution produces ``2 * residual_channels`` channels; the
    first half feeds the tanh branch and the second half the sigmoid gate.
    ``forward`` returns ``(residual_output, skip_output)``.
    """
    def __init__(self, residual_channels, skip_channels, dilation):
        super().__init__()
        self.dilated_conv = CausalConv1d(
            residual_channels, 2 * residual_channels, kernel_size=2, dilation=dilation)
        self.conv_res = nn.Conv1d(residual_channels, residual_channels, kernel_size=1)
        self.conv_skip = nn.Conv1d(residual_channels, skip_channels, kernel_size=1)

    def forward(self, x):
        channels = x.size(1)
        conv_out = self.dilated_conv(x)
        # Split the doubled channel axis into the filter and gate halves.
        filter_part = conv_out[:, :channels, :]
        gate_part = conv_out[:, channels:, :]
        gated = torch.tanh(filter_part) * torch.sigmoid(gate_part)
        skip = self.conv_skip(gated)
        residual = x + self.conv_res(gated)
        return residual, skip

In [7]:
# Full WaveNet model
# Note: this sample has no conditional input; the gated activation (tanh * sigmoid)
# is implemented inside ResidualBlock.
class WaveNet(nn.Module):
    """Stack of dilated residual blocks followed by a 1x1-convolution output head.

    Args:
        num_blocks: Number of times the dilation cycle is repeated.
        num_layers: Layers per cycle; layer ``l`` uses dilation ``2 ** l``.
        residual_channels: Channels carried along the residual path.
        skip_channels: Channels of each skip connection.
        num_classes: Size of the output channel axis.

    Raises:
        ValueError: If ``num_blocks`` or ``num_layers`` is smaller than 1.

    Input shape is (batch_size, 1, sequence_length); the output holds
    log-probabilities over axis 1 with shape (batch_size, num_classes, sequence_length).
    """
    def __init__(self, num_blocks, num_layers, residual_channels, skip_channels, num_classes):
        super(WaveNet, self).__init__()
        if num_blocks < 1 or num_layers < 1:
            # Without any residual block there would be no skip output to feed the head.
            raise ValueError("num_blocks and num_layers must both be >= 1")
        self.num_blocks = num_blocks
        self.num_layers = num_layers
        self.front_conv = nn.Conv1d(1, residual_channels, kernel_size=1)
        self.res_blocks = nn.ModuleList()
        for _ in range(num_blocks):
            for layer in range(num_layers):
                self.res_blocks.append(ResidualBlock(residual_channels, skip_channels, 2 ** layer))
        self.skip_conv = nn.Conv1d(skip_channels, skip_channels, kernel_size=1)
        self.fc = nn.Conv1d(skip_channels, num_classes, kernel_size=1)

    def forward(self, x):
        x = self.front_conv(x)
        # Accumulate the skip outputs as we go instead of keeping every one in a list.
        skip_total = 0
        for res_block in self.res_blocks:
            x, skip = res_block(x)
            skip_total = skip_total + skip
        x = F.relu(skip_total)
        x = self.skip_conv(x)
        x = F.relu(x)
        x = self.fc(x)
        return F.log_softmax(x, dim=1)

In [9]:
# Create a model instance
num_blocks = 3  # number of dilation cycles
num_layers = 10  # layers per cycle; WaveNet uses dilation 2**l for layer l
residual_channels = 32
skip_channels = 64
num_classes = 256  # size of the output channel axis
model = WaveNet(num_blocks, num_layers, residual_channels, skip_channels, num_classes)

In [10]:
# Create test data
batch_size = 5
sequence_length = 16000  # length of each sequence
test_data = torch.randn(batch_size, 1, sequence_length)

In [11]:
# Run the model in evaluation mode without tracking gradients
model.eval()
with torch.no_grad():
    output = model(test_data)

print(output.size())  # expected: (batch_size, num_classes, sequence_length)

torch.Size([5, 256, 16000])


## 7.2 RNN类深度网络

### 7.2.1 ESN

In [12]:
import torch
import torch.nn as nn
import numpy as np

In [13]:
# Echo state network definition
class EchoStateNetwork(nn.Module):
    """Echo state network with frozen input/reservoir weights and a trainable readout.

    ``forward`` takes ``x`` of shape (batch_size, sequence_length, input_size),
    runs the reservoir over every time step starting from a zero state, and
    returns the readout of the final state, shape (batch_size, output_size).
    """
    def __init__(self, input_size, reservoir_size, output_size):
        super().__init__()

        # Input-to-reservoir weights (not trained)
        w_in = torch.randn(input_size, reservoir_size) * 0.1
        self.input_weights = nn.Parameter(w_in, requires_grad=False)

        # Reservoir weights (not trained), rescaled so that the largest
        # eigenvalue magnitude (spectral radius) is 0.9, i.e. below 1
        w_res = torch.rand(reservoir_size, reservoir_size) - 0.5
        spectral_radius = torch.max(torch.abs(torch.linalg.eigvals(w_res)))
        w_res = w_res * (0.9 / spectral_radius)
        self.reservoir_weights = nn.Parameter(w_res, requires_grad=False)

        # Readout weights: the only parameter that requires gradients
        w_out = torch.randn(reservoir_size, output_size) * 0.1
        self.output_weights = nn.Parameter(w_out)

    def forward(self, x):
        n_batch, n_steps, _ = x.size()
        n_reservoir = self.reservoir_weights.size(0)

        # Start from an all-zero reservoir state
        state = torch.zeros(n_batch, n_reservoir, device=x.device)

        # Update the reservoir state one time step at a time
        for step in range(n_steps):
            drive = x[:, step] @ self.input_weights
            echo = state @ self.reservoir_weights
            state = torch.tanh(drive + echo)

        # Linear readout of the final state
        return state @ self.output_weights

In [17]:
# Test data generation
def generate_test_data(sequence_length, batch_size, input_size, output_size):
    """Return random ``(x, y)`` tensors for smoke-testing a sequence model.

    ``x`` has shape (batch_size, sequence_length, input_size) and ``y`` has
    shape (batch_size, output_size); both are drawn from a standard normal
    distribution, so ``y`` is only a placeholder target.
    """
    inputs = torch.randn(batch_size, sequence_length, input_size)
    targets = torch.randn(batch_size, output_size)
    return inputs, targets


In [16]:
# Settings for testing the ESN model
input_size = 10
reservoir_size = 100
output_size = 1
sequence_length = 20
batch_size = 5

In [18]:
# Generate test data (random inputs and random placeholder targets)
x, y_true = generate_test_data(sequence_length, batch_size, input_size, output_size)

In [19]:
# Create an ESN instance
esn = EchoStateNetwork(input_size, reservoir_size, output_size)

In [20]:
# Run the model
y_pred = esn(x)

# Print the predictions
print("Predicted output:", y_pred)

Predicted output: tensor([[ 0.3731],
        [-0.0515],
        [-0.4167],
        [ 0.1139],
        [ 0.3155]], grad_fn=<MmBackward0>)


### 7.2.2 TPA-LSTM

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F


In [22]:
# TPA-LSTM model definition
class TemporalPatternAttention(nn.Module):
    """Additive attention over the time axis of a sequence of feature vectors.

    Each time step is scored from its own features and the mean over time, the
    scores are soft-maxed along the time axis, and the inputs are scaled by the
    resulting weights.

    Args:
        input_size: Feature size of every time step.
        time_steps: Stored on the instance only; the computation does not use it.
    """
    def __init__(self, input_size, time_steps):
        super(TemporalPatternAttention, self).__init__()
        self.input_size = input_size
        self.time_steps = time_steps
        # Allocated uninitialised; init_weights() fills every parameter below.
        self.W = nn.Parameter(torch.empty(input_size, input_size))
        self.U = nn.Parameter(torch.empty(input_size, input_size))
        self.b = nn.Parameter(torch.empty(input_size))
        self.v = nn.Parameter(torch.empty(input_size, 1))
        self.init_weights()

    def init_weights(self):
        """Initialise all parameters from N(0, 0.01^2)."""
        for param in self.parameters():
            nn.init.normal_(param, mean=0, std=0.01)

    def forward(self, x):
        """Return ``(x * alphas, alphas)`` for ``x`` of shape (batch_size, time_steps, input_size)."""
        # x shape: (batch_size, time_steps, input_size)
        Ux = torch.tanh(torch.matmul(x, self.W) + torch.matmul(x.mean(dim=1, keepdim=True), self.U) + self.b)
        # Ux shape: (batch_size, time_steps, input_size)
        vu = torch.matmul(Ux, self.v)
        # vu shape: (batch_size, time_steps, 1)
        alphas = F.softmax(vu, dim=1)
        # alphas shape: (batch_size, time_steps, 1); sums to 1 along the time axis
        output = x * alphas
        # output shape: (batch_size, time_steps, input_size)
        return output, alphas

class TPA_LSTM(nn.Module):
    """LSTM encoder followed by attention pooling over time and a linear head.

    Args:
        input_size: Number of input features per time step.
        hidden_size: LSTM hidden size (also the attention feature size).
        time_steps: Sequence length, forwarded to the attention module.
        num_layers: Number of stacked LSTM layers (default 1).

    ``forward`` returns ``(prediction, alphas)`` with shapes (batch_size, 1)
    and (batch_size, time_steps, 1).
    """
    def __init__(self, input_size, hidden_size, time_steps, num_layers=1):
        super(TPA_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.time_steps = time_steps
        self.input_size = input_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.attention = TemporalPatternAttention(hidden_size, time_steps)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # x shape: (batch_size, time_steps, input_size)
        lstm_out, _ = self.lstm(x)
        # lstm_out shape: (batch_size, time_steps, hidden_size)
        attn_out, alphas = self.attention(lstm_out)
        # attn_out shape: (batch_size, time_steps, hidden_size)
        # alphas shape: (batch_size, time_steps, 1)
        # Pool the attention-weighted states over time to form the context vector.
        # Taking only attn_out[:, -1, :] would discard every other time step and
        # reduce the attention to a scalar rescaling of the last hidden state.
        out = attn_out.sum(dim=1)
        # out shape: (batch_size, hidden_size)
        out = self.fc(out)
        # out shape: (batch_size, 1)
        return out, alphas

In [23]:
# Parameter settings
time_steps = 10
input_size = 5
hidden_size = 64
batch_size = 32
num_layers = 1

In [24]:
# Create a TPA-LSTM model instance
model = TPA_LSTM(input_size=input_size, hidden_size=hidden_size, time_steps=time_steps, num_layers=num_layers)

In [25]:
# Create test data with shape (batch_size, time_steps, input_size)
test_data = torch.rand(batch_size, time_steps, input_size)


In [27]:
# Model output: the prediction and the attention weights
output, alphas = model(test_data)

print("输出形状:", output.shape)
print("输出:", output)

输出形状: torch.Size([32, 1])
输出: tensor([[-0.0256],
        [-0.0261],
        [-0.0263],
        [-0.0263],
        [-0.0264],
        [-0.0258],
        [-0.0257],
        [-0.0255],
        [-0.0259],
        [-0.0257],
        [-0.0256],
        [-0.0249],
        [-0.0258],
        [-0.0261],
        [-0.0258],
        [-0.0253],
        [-0.0262],
        [-0.0261],
        [-0.0258],
        [-0.0260],
        [-0.0258],
        [-0.0265],
        [-0.0262],
        [-0.0266],
        [-0.0250],
        [-0.0248],
        [-0.0262],
        [-0.0251],
        [-0.0268],
        [-0.0264],
        [-0.0264],
        [-0.0254]], grad_fn=<AddmmBackward0>)


### 7.2.3 DeepAR

In [None]:
# DeepAR 模型使用GluonTS的接口实现，具体可以参考
# https://ts.gluon.ai/stable/api/gluonts/gluonts.mx.model.deepar.html?highlight=deeparestimator#gluonts.mx.model.deepar.DeepAREstimator
import mxnet as mx
from gluonts.model.deepar import DeepAREstimator
from gluonts.mx.trainer import Trainer
from gluonts.dataset.common import ListDataset
import pandas as pd
import numpy as np

In [None]:
# Set the random seeds for reproducibility
np.random.seed(42)
mx.random.seed(42)

# Settings for the synthetic test data
num_series = 10
num_steps = 24
prediction_length = 7
freq = '1H'  # one data point per hour

In [None]:
# Build num_series random series; each target drops its last prediction_length values
data = []
for i in range(num_series):
    ts = pd.date_range(start='2020-01-01', periods=num_steps, freq=freq)
    values = np.random.rand(len(ts))
    data.append({'start': ts[0], 'target': values[:-prediction_length]})

# Wrap the entries in a GluonTS ListDataset
test_data = ListDataset(data, freq=freq, one_dim_target=True)

In [None]:
# Define the hyperparameters of the DeepAR model
estimator = DeepAREstimator(
    freq=freq,
    prediction_length=prediction_length,
    trainer=Trainer(epochs=5)  # train for 5 epochs
)

In [None]:
# Train the model (note: the dataset named test_data is used as the training data here)
predictor = estimator.train(training_data=test_data)

In [None]:
# Use the model to make predictions
for test_entry, forecast in zip(test_data, predictor.predict(test_data)):
    print(forecast.mean)
    # The mean is used here; quantiles are also available, e.g. .quantile(0.7) -> 70th-percentile forecast

### 7.2.4 LSTNet

In [68]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [100]:
# LSTNet model definition
class LSTNet(nn.Module):
    """Simplified LSTNet: Conv1d over time -> ReLU -> GRU -> linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: GRU hidden size.
        output_size: Size of the prediction.
        kernel_size: Convolution kernel size along the time axis.
        cnn_out_channels: Number of convolution output channels.

    ``forward`` takes ``x`` of shape (batch, seq_len, input_size) and returns a
    tensor of shape (batch, output_size).
    """
    def __init__(self, input_size, hidden_size, output_size, kernel_size, cnn_out_channels):
        super(LSTNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=input_size, out_channels=cnn_out_channels, kernel_size=kernel_size)
        self.relu = nn.ReLU()
        self.gru = nn.GRU(cnn_out_channels, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Convolutional layer
        # Swap the time and feature axes: (batch, seq_len, input_size) -> (batch, input_size, seq_len).
        # This must be a permute: view() would only reinterpret memory and mix
        # time steps with features whenever input_size > 1.
        c = x.permute(0, 2, 1)
        c = self.conv1(c)
        c = self.relu(c)

        # GRU layer
        r = c.permute(2, 0, 1)  # (seq_len, batch, cnn_out_channels)
        _, h = self.gru(r)

        # Fully connected layer on the final hidden state of the (single-layer) GRU
        out = self.fc(h[-1])
        return out

In [101]:

# Parameter settings
input_size = 1  # number of input features
hidden_size = 10  # GRU hidden size
output_size = 1  # number of output features
kernel_size = 2  # CNN kernel size
cnn_out_channels = 5  # number of CNN output channels
seq_length = 10  # sequence length
batch_size = 16  # batch size
epochs = 5  # number of training epochs

In [102]:
# Create test data (seed both NumPy and PyTorch)
np.random.seed(0)
torch.manual_seed(0)

# Generate random data as input: 100 samples
x = np.random.randn(100, seq_length, input_size).astype(np.float32)
y = np.random.randn(100, output_size).astype(np.float32)

# Convert to tensors
x_tensor = torch.from_numpy(x)
y_tensor = torch.from_numpy(y)


In [103]:
# Create the DataLoader (reshuffled on every pass)
dataset = TensorDataset(x_tensor, y_tensor)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Instantiate the model
model = LSTNet(input_size, hidden_size, output_size, kernel_size, cnn_out_channels)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [104]:
# Train the model
for epoch in range(epochs):
    for i, (inputs, targets) in enumerate(data_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Log every 10 steps and also on the last step of each epoch; with fewer
        # than 10 batches per epoch the "every 10 steps" rule alone never fires.
        if (i+1) % 10 == 0 or (i+1) == len(data_loader):
            print(f'Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(data_loader)}], Loss: {loss.item():.4f}')

In [105]:
# Evaluate the model
# One batch drawn from the (shuffled) training loader stands in for test data
test_inputs, test_targets = next(iter(data_loader))
model.eval()
with torch.no_grad():  # inference only: no autograd graph is needed
    predictions = model(test_inputs)
print("Predictions:", predictions.numpy())
print("Actual targets:", test_targets.numpy())

Predictions: [[-0.05802027]
 [-0.02585397]
 [ 0.07551305]
 [-0.00784112]
 [-0.00163513]
 [ 0.00550213]
 [ 0.0008548 ]
 [-0.04464719]
 [-0.04730664]
 [-0.03543879]
 [-0.02438176]
 [-0.04572471]
 [-0.02713918]
 [-0.04959046]
 [-0.02132425]
 [-0.04947438]]
Actual targets: [[-0.2311016 ]
 [-1.8180777 ]
 [ 0.18949963]
 [-0.04932407]
 [-0.8000825 ]
 [ 0.41839802]
 [ 3.1709747 ]
 [ 0.67746216]
 [ 1.0490932 ]
 [-1.4299912 ]
 [ 0.20147994]
 [ 0.16155927]
 [-0.2069447 ]
 [-0.9872867 ]
 [ 0.22425222]
 [ 2.1495745 ]]


## 7.3 Transformer

In [50]:
import math
import pandas as pd
import torch
from torch import nn
import os

In [33]:
class PositionWiseFFN(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear.

    The two linear layers act on the last axis only, so every position of the
    input is transformed independently with the same weights.
    """
    def __init__(self, ffn_num_input, ffn_num_hiddens, ffn_num_outputs,
                 **kwargs):
        super().__init__(**kwargs)
        self.dense1 = nn.Linear(ffn_num_input, ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(ffn_num_hiddens, ffn_num_outputs)

    def forward(self, X):
        hidden = self.dense1(X)
        activated = self.relu(hidden)
        return self.dense2(activated)

In [34]:
class AddNorm(nn.Module):
    """Residual connection followed by layer normalization.

    ``forward(X, Y)`` applies dropout to ``Y``, adds ``X`` and layer-normalizes
    the sum.
    """
    def __init__(self, normalized_shape, dropout, **kwargs):
        super().__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(normalized_shape)

    def forward(self, X, Y):
        summed = self.dropout(Y) + X
        return self.ln(summed)

In [35]:
class DotProductAttention(nn.Module):
    """Scaled dot-product attention.

    The most recent attention weights are kept on ``self.attention_weights``.
    """
    def __init__(self, dropout, **kwargs):
        super().__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    # Shape of queries: (batch_size, number of queries, d)
    # Shape of keys: (batch_size, number of key-value pairs, d)
    # Shape of values: (batch_size, number of key-value pairs, value dimension)
    # Shape of valid_lens: (batch_size,) or (batch_size, number of queries)
    def forward(self, queries, keys, values, valid_lens=None):
        scale = math.sqrt(queries.shape[-1])
        # Swap the last two axes of keys so that bmm yields query-key scores
        keys_t = keys.transpose(1, 2)
        scores = torch.bmm(queries, keys_t) / scale
        self.attention_weights = masked_softmax(scores, valid_lens)
        dropped_weights = self.dropout(self.attention_weights)
        return torch.bmm(dropped_weights, values)

In [36]:
def grad_clipping(net, theta):
    """Clip gradients in place so that their global L2 norm is at most ``theta``.

    Args:
        net: An ``nn.Module`` (its trainable parameters are used) or any object
            exposing a ``params`` list of tensors.
        theta: Maximum allowed L2 norm of all gradients taken together.

    Parameters whose ``.grad`` is ``None`` (for example, parameters that took no
    part in the last backward pass) are skipped. If no parameter has a
    gradient, the call does nothing.
    """
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    grads = [p.grad for p in params if p.grad is not None]
    if not grads:
        return
    norm = torch.sqrt(sum(torch.sum((g ** 2)) for g in grads))
    if norm > theta:
        for g in grads:
            g[:] *= theta / norm

In [59]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on scaled dot-product attention.

    Queries, keys and values are projected to ``num_hiddens`` features, split
    into ``num_heads`` heads that are folded into the batch axis, attended in
    parallel, then concatenated again and projected by ``W_o``.
    """
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 num_heads, dropout, bias=False, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.attention = DotProductAttention(dropout)
        self.W_q = nn.Linear(query_size, num_hiddens, bias=bias)
        self.W_k = nn.Linear(key_size, num_hiddens, bias=bias)
        self.W_v = nn.Linear(value_size, num_hiddens, bias=bias)
        self.W_o = nn.Linear(num_hiddens, num_hiddens, bias=bias)

    def forward(self, queries, keys, values, valid_lens):
        # Shape of queries, keys, values on entry:
        # (batch_size, number of queries or key-value pairs, num_hiddens)
        # Shape of valid_lens:
        # (batch_size,) or (batch_size, number of queries)
        heads = self.num_heads

        # After projection and head splitting:
        # (batch_size * num_heads, number of queries or key-value pairs,
        # num_hiddens / num_heads)
        q_heads = transpose_qkv(self.W_q(queries), heads)
        k_heads = transpose_qkv(self.W_k(keys), heads)
        v_heads = transpose_qkv(self.W_v(values), heads)

        if valid_lens is not None:
            # Along axis 0, repeat the first item (scalar or vector) num_heads
            # times, then the second item, and so on.
            valid_lens = torch.repeat_interleave(
                valid_lens, repeats=heads, dim=0)

        # per_head shape: (batch_size * num_heads, number of queries,
        # num_hiddens / num_heads)
        per_head = self.attention(q_heads, k_heads, v_heads, valid_lens)

        # merged shape: (batch_size, number of queries, num_hiddens)
        merged = transpose_output(per_head, heads)
        return self.W_o(merged)

def transpose_qkv(X, num_heads):
    """Reshape ``X`` so that all attention heads can be computed in parallel.

    Input shape: (batch_size, number of queries or key-value pairs, num_hiddens).
    Output shape: (batch_size * num_heads, number of queries or key-value pairs,
    num_hiddens / num_heads).
    """
    batch_size, num_items = X.shape[0], X.shape[1]

    # (batch_size, num_items, num_heads, num_hiddens / num_heads)
    split = X.reshape(batch_size, num_items, num_heads, -1)

    # (batch_size, num_heads, num_items, num_hiddens / num_heads)
    heads_first = split.permute(0, 2, 1, 3)

    # Fold the head axis into the batch axis
    return heads_first.reshape(-1, heads_first.shape[2], heads_first.shape[3])


#@save
def transpose_output(X, num_heads):
    """Reverse the reshaping done by ``transpose_qkv``.

    Input shape: (batch_size * num_heads, number of items, head_dim).
    Output shape: (batch_size, number of items, num_heads * head_dim).
    """
    num_items, head_dim = X.shape[1], X.shape[2]
    # Unfold the head axis out of the batch axis
    per_head = X.reshape(-1, num_heads, num_items, head_dim)
    # Move the head axis next to the feature axis, then concatenate the heads
    items_first = per_head.permute(0, 2, 1, 3)
    return items_first.reshape(items_first.shape[0], items_first.shape[1], -1)

In [38]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding followed by dropout.

    Args:
        num_hiddens: Feature size of the inputs (even or odd).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Longest sequence length the precomputed table supports.

    The table ``P`` has shape (1, max_len, num_hiddens). It is registered as a
    non-persistent buffer, so it follows the module across ``.to(device)``
    calls without being added to ``state_dict()``.
    """
    def __init__(self, num_hiddens, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # Build a table P that is long enough
        P = torch.zeros((1, max_len, num_hiddens))
        X = torch.arange(max_len, dtype=torch.float32).reshape(
            -1, 1) / torch.pow(10000, torch.arange(
            0, num_hiddens, 2, dtype=torch.float32) / num_hiddens)
        P[:, :, 0::2] = torch.sin(X)
        # For an odd num_hiddens there is one fewer odd-indexed column than
        # even-indexed ones, so trim the cosine input to match.
        P[:, :, 1::2] = torch.cos(X[:, :num_hiddens // 2])
        self.register_buffer("P", P, persistent=False)

    def forward(self, X):
        # Add the encoding of the first X.shape[1] positions (broadcast over the batch)
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)

In [39]:
class EncoderBlock(nn.Module):
    """Transformer encoder block.

    Self-attention and a position-wise feed-forward network, each followed by
    a residual connection with layer normalization.
    """
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                 dropout, use_bias=False, **kwargs):
        super().__init__(**kwargs)
        self.attention = MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout,
            use_bias)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(
            ffn_num_input, ffn_num_hiddens, num_hiddens)
        self.addnorm2 = AddNorm(norm_shape, dropout)

    def forward(self, X, valid_lens):
        # Self-attention: X serves as queries, keys and values
        attended = self.attention(X, X, X, valid_lens)
        Y = self.addnorm1(X, attended)
        transformed = self.ffn(Y)
        return self.addnorm2(Y, transformed)

In [41]:
class Encoder(nn.Module):
    """Base encoder interface for the encoder-decoder architecture.

    Subclasses must override ``forward``; the base implementation raises
    ``NotImplementedError``.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, X, *args):
        raise NotImplementedError

In [43]:
class TransformerEncoder(Encoder):
    """Transformer encoder: embedding + positional encoding + a stack of EncoderBlock.

    After ``forward``, ``self.attention_weights`` holds one attention-weight
    tensor per block.
    """
    def __init__(self, vocab_size, key_size, query_size, value_size,
                 num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
                 num_heads, num_layers, dropout, use_bias=False, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        # Use the PositionalEncoding class defined in this notebook (as the
        # decoder does) rather than d2l's copy, so the encoder does not depend
        # on d2l having been imported.
        self.pos_encoding = PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module("block"+str(i),
                EncoderBlock(key_size, query_size, value_size, num_hiddens,
                             norm_shape, ffn_num_input, ffn_num_hiddens,
                             num_heads, dropout, use_bias))

    def forward(self, X, valid_lens, *args):
        # Positional encoding values lie between -1 and 1, so the embeddings
        # are scaled by the square root of the embedding dimension before the
        # two are added together.
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self.attention_weights = [None] * len(self.blks)
        for i, blk in enumerate(self.blks):
            X = blk(X, valid_lens)
            self.attention_weights[
                i] = blk.attention.attention.attention_weights
        return X

In [53]:
class DecoderBlock(nn.Module):
    """The i-th block of the Transformer decoder.

    Masked self-attention, encoder-decoder attention and a position-wise
    feed-forward network, each followed by a residual connection with layer
    normalization.
    """
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                 dropout, i, **kwargs):
        super().__init__(**kwargs)
        self.i = i
        self.attention1 = MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.attention2 = MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm2 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(ffn_num_input, ffn_num_hiddens,
                                   num_hiddens)
        self.addnorm3 = AddNorm(norm_shape, dropout)

    def forward(self, X, state):
        enc_outputs, enc_valid_lens = state[0], state[1]
        # During training all tokens of the output sequence are processed at
        # once, so state[2][self.i] starts out as None.
        # During prediction the output sequence is decoded token by token, so
        # state[2][self.i] holds this block's decoded representations up to
        # the current time step.
        cached = state[2][self.i]
        key_values = X if cached is None else torch.cat((cached, X), axis=1)
        state[2][self.i] = key_values

        dec_valid_lens = None
        if self.training:
            batch_size, num_steps, _ = X.shape
            # Shape of dec_valid_lens: (batch_size, num_steps),
            # where every row is [1, 2, ..., num_steps]
            steps = torch.arange(1, num_steps + 1, device=X.device)
            dec_valid_lens = steps.repeat(batch_size, 1)

        # Self-attention
        self_attended = self.attention1(X, key_values, key_values, dec_valid_lens)
        Y = self.addnorm1(X, self_attended)
        # Encoder-decoder attention
        # Shape of enc_outputs: (batch_size, num_steps, num_hiddens)
        cross_attended = self.attention2(Y, enc_outputs, enc_outputs, enc_valid_lens)
        Z = self.addnorm2(Y, cross_attended)
        return self.addnorm3(Z, self.ffn(Z)), state

In [45]:
class Decoder(nn.Module):
    """Base decoder interface for the encoder-decoder architecture.

    Subclasses must override ``init_state`` and ``forward``; the base
    implementations raise ``NotImplementedError``.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        raise NotImplementedError

    def forward(self, X, state):
        raise NotImplementedError

In [46]:
class AttentionDecoder(Decoder):
    """Base interface for decoders that use an attention mechanism.

    Subclasses must provide the ``attention_weights`` property; the base
    implementation raises ``NotImplementedError``.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def attention_weights(self):
        raise NotImplementedError

In [47]:
class TransformerDecoder(AttentionDecoder):
    """Transformer decoder: embedding + positional encoding + DecoderBlock stack + linear output layer.

    ``forward`` returns ``(logits, state)``. After each call the
    ``attention_weights`` property holds two lists with one entry per block:
    the self-attention weights and the encoder-decoder attention weights.
    """
    def __init__(self, vocab_size, key_size, query_size, value_size,
                 num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
                 num_heads, num_layers, dropout, **kwargs):
        super().__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            block = DecoderBlock(key_size, query_size, value_size, num_hiddens,
                                 norm_shape, ffn_num_input, ffn_num_hiddens,
                                 num_heads, dropout, i)
            self.blks.add_module("block"+str(i), block)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_lens, *args):
        # One cache slot per decoder block, filled in by DecoderBlock.forward
        per_block_cache = [None] * self.num_layers
        return [enc_outputs, enc_valid_lens, per_block_cache]

    def forward(self, X, state):
        scaled = self.embedding(X) * math.sqrt(self.num_hiddens)
        X = self.pos_encoding(scaled)
        self._attention_weights = [[None] * len(self.blks) for _ in range (2)]
        self_attn, cross_attn = self._attention_weights
        for i, blk in enumerate(self.blks):
            X, state = blk(X, state)
            # Decoder self-attention weights
            self_attn[i] = blk.attention1.attention.attention_weights
            # Encoder-decoder attention weights
            cross_attn[i] = blk.attention2.attention.attention_weights
        return self.dense(X), state

    @property
    def attention_weights(self):
        return self._attention_weights

In [48]:
class EncoderDecoder(nn.Module):
    """Base class for the encoder-decoder architecture.

    ``forward`` encodes ``enc_X``, lets the decoder build its initial state
    from the encoder outputs (both receive the extra positional arguments),
    and returns whatever the decoder returns for ``dec_X``.
    """
    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        encoded = self.encoder(enc_X, *args)
        initial_state = self.decoder.init_state(encoded, *args)
        return self.decoder(dec_X, initial_state)

In [51]:
# 准备数据，本次使用d2l提供的英文-法语数据集
from d2l import torch as d2l

# Register the 'fra-eng' archive in d2l's data hub as a (URL, hash string) pair
# NOTE(review): the second element is presumably the archive checksum used by d2l — confirm
d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip',
                           '94646ad1522d915e7b0f9296181140edcf86a4f5')

def read_data_nmt():
    """Load the English-French dataset and return the raw text of ``fra.txt``."""
    data_dir = d2l.download_extract('fra-eng')
    text_path = os.path.join(data_dir, 'fra.txt')
    with open(text_path, 'r', encoding='utf-8') as handle:
        raw = handle.read()
    return raw

# Load the raw text and preview its first 75 characters
raw_text = read_data_nmt()
print(raw_text[:75])

Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !



In [54]:
# Training
# Define the hyperparameters
num_hiddens, num_layers, dropout, batch_size, num_steps = 32, 2, 0.1, 64, 10
lr, num_epochs, device = 0.005, 200, d2l.try_gpu()
ffn_num_input, ffn_num_hiddens, num_heads = 32, 64, 4
key_size, query_size, value_size = 32, 32, 32
norm_shape = [32]

# Load the data iterator together with the source and target vocabularies
train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)

encoder = TransformerEncoder(
    len(src_vocab), key_size, query_size, value_size, num_hiddens,
    norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
    num_layers, dropout)
decoder = TransformerDecoder(
    len(tgt_vocab), key_size, query_size, value_size, num_hiddens,
    norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
    num_layers, dropout)

# Define the model
net = EncoderDecoder(encoder, decoder)

In [66]:
# Train the model
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a sequence-to-sequence model with teacher forcing.

    Args:
        net: Model called as ``net(X, dec_input, X_valid_len)`` that returns
            ``(Y_hat, state)``.
        data_iter: Iterable of batches ``(X, X_valid_len, Y, Y_valid_len)``.
        lr: Learning rate for the Adam optimizer.
        num_epochs: Number of passes over ``data_iter``.
        tgt_vocab: Target vocabulary; ``tgt_vocab['<bos>']`` must give the
            begin-of-sequence token index.
        device: Device the model and every batch are moved to.

    The model is re-initialised (Xavier), moved to ``device`` and updated in
    place; nothing is returned.
    """
    def xavier_init_weights(m):
        """Xavier-initialise Linear weights and the weight matrices of GRU layers."""
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])

    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()
    for epoch in range(num_epochs):
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                          device=device).reshape(-1, 1)
            # Teacher forcing: the decoder input is <bos> followed by the
            # target sequence without its last token.
            dec_input = torch.cat([bos, Y[:, :-1]], 1)
            Y_hat, _ = net(X, dec_input, X_valid_len)
            batch_loss = loss(Y_hat, Y, Y_valid_len)
            batch_loss.sum().backward()  # back-propagate the scalar sum of the per-sequence losses
            grad_clipping(net, 1)
            optimizer.step()

In [64]:
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Softmax cross-entropy loss that masks out positions past each valid length."""
    # Shape of pred: (batch_size, num_steps, vocab_size)
    # Shape of label: (batch_size, num_steps)
    # Shape of valid_len: (batch_size,)
    def forward(self, pred, label, valid_len):
        # 1 for positions inside the valid length, 0 for the rest
        mask = sequence_mask(torch.ones_like(label), valid_len)
        # Per-token losses are required so that the mask can be applied
        self.reduction='none'
        # CrossEntropyLoss expects the class axis at position 1
        class_first = pred.permute(0, 2, 1)
        per_token = super().forward(class_first, label)
        # Average over the num_steps axis: one loss value per sequence
        return (per_token * mask).mean(dim=1)

def masked_softmax(X, valid_lens):
    """Softmax over the last axis, ignoring elements past the valid lengths.

    Args:
        X: 3D tensor of scores.
        valid_lens: ``None`` (plain softmax), a 1D tensor with one valid length
            per batch entry, or a 2D tensor with one valid length per row of
            scores.

    Returns:
        A tensor shaped like ``X`` whose last axis sums to 1.
    """
    if valid_lens is None:
        return nn.functional.softmax(X, dim=-1)
    shape = X.shape
    if valid_lens.dim() == 1:
        # One length per batch entry: repeat it for every row of that entry
        valid_lens = torch.repeat_interleave(valid_lens, shape[1])
    else:
        valid_lens = valid_lens.reshape(-1)
    # Masked elements on the last axis are replaced by a very large negative
    # value, so their softmax output is effectively 0. The sequence_mask
    # defined in this notebook is used so that this function does not depend
    # on d2l.
    X = sequence_mask(X.reshape(-1, shape[-1]), valid_lens,
                      value=-1e6)
    return nn.functional.softmax(X.reshape(shape), dim=-1)

def sequence_mask(X, valid_len, value=0):
    """Overwrite the entries of ``X`` beyond each row's valid length with ``value``.

    ``X`` is modified in place and also returned. Row ``i`` keeps its first
    ``valid_len[i]`` entries along axis 1.
    """
    num_positions = X.size(1)
    positions = torch.arange((num_positions), dtype=torch.float32,
                             device=X.device)
    # keep[i, j] is True while position j lies inside row i's valid length
    keep = positions.unsqueeze(0) < valid_len.unsqueeze(1)
    X[~keep] = value
    return X


In [None]:
# Start training
train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)

## 7.4 N-beats

In [113]:
import torch
from torch import nn

In [114]:
class NBeatsBlock(nn.Module):
    """One N-BEATS block: a ReLU MLP whose output ``theta`` is split into backcast and forecast.

    ``theta`` has ``theta_size`` entries per sample; the first ``input_size``
    form the backcast and the remaining ones the forecast.
    """
    def __init__(self, input_size, theta_size, hidden_units, layers):
        super().__init__()
        stack = []
        in_features = input_size
        for _ in range(layers):
            stack.append(nn.Linear(in_features, hidden_units))
            in_features = hidden_units  # every layer after the first is hidden -> hidden
        self.fc_layers = nn.ModuleList(stack)
        self.fc_theta = nn.Linear(hidden_units, theta_size)
        self.backcast_length = input_size
        self.forecast_length = theta_size - input_size

    def forward(self, x):
        hidden = x
        for fc in self.fc_layers:
            hidden = torch.relu(fc(hidden))
        theta = self.fc_theta(hidden)
        split_at = self.backcast_length
        return theta[:, :split_at], theta[:, split_at:]

class NBeats(nn.Module):
    """Stack of NBeatsBlock modules with residual backcast subtraction.

    Every block receives the input minus the backcasts of all previous
    blocks. ``forward`` returns the stacked backcasts (one per block) and the
    sum of all block forecasts.
    """
    def __init__(self, input_size, theta_size, hidden_units, layers, blocks):
        super().__init__()
        block_list = [NBeatsBlock(input_size, theta_size, hidden_units, layers)
                      for _ in range(blocks)]
        self.blocks = nn.ModuleList(block_list)

    def forward(self, x):
        backcasts, forecasts = [], []
        residual = x
        for block in self.blocks:
            backcast, forecast = block(residual)
            backcasts.append(backcast)
            forecasts.append(forecast)
            # The next block only sees what this block failed to explain
            residual = residual - backcast
        total_forecast = torch.stack(forecasts).sum(dim=0)
        return torch.stack(backcasts), total_forecast

In [115]:
# Hyperparameters
input_size = 10  # Number of past time steps used as input
output_size = 5  # Number of future time steps to predict
hidden_units = 512
layers = 4
blocks = 3

In [116]:
# Instantiate the model; theta_size is input_size + output_size (backcast + forecast)
nbeats = NBeats(input_size, input_size + output_size, hidden_units, layers, blocks)

In [117]:
# Create test data (random inputs and random targets)
batch_size = 16
num_samples = 100
X = torch.randn(batch_size, num_samples, input_size)
y = torch.randn(batch_size, num_samples, output_size)


In [118]:
# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(nbeats.parameters())

# Start training
epochs = 10
for epoch in range(epochs):
    # One optimizer step per index i along axis 1 of X / y
    for i in range(num_samples):
        optimizer.zero_grad()
        backcasts, forecast = nbeats(X[:, i, :])
        loss = criterion(forecast, y[:, i, :])
        loss.backward()
        optimizer.step()

    # The printed value is the loss of the last step of the epoch, not an epoch average
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Epoch 1/10, Loss: 0.9286693334579468
Epoch 2/10, Loss: 0.9149163961410522
Epoch 3/10, Loss: 0.9017965197563171
Epoch 4/10, Loss: 0.9096616506576538
Epoch 5/10, Loss: 0.9134806394577026
Epoch 6/10, Loss: 0.9015148878097534
Epoch 7/10, Loss: 0.8723934292793274
Epoch 8/10, Loss: 0.8574414253234863
Epoch 9/10, Loss: 0.8507696986198425
Epoch 10/10, Loss: 0.8287347555160522


In [None]:
# Prediction on a fresh random input, without tracking gradients
with torch.no_grad():
    test_input = torch.randn(batch_size, input_size)
    _, forecast = nbeats(test_input)
    print(f'Forecast: {forecast}')

## 7.5 Prophet

In [106]:
# prophet使用facebook实现的api接口
from prophet import Prophet
import pandas as pd
from matplotlib import pyplot as plt

In [107]:
# Create test data
# Prophet needs two columns: 'ds' and 'y'
ds = pd.date_range(start='2021-01-01', periods=365, freq='D')
day_ramp = pd.Series(range(365))
gaussian_noise = pd.Series(np.random.normal(0, 2, 365))
# Simple linear growth plus some noise
y = (day_ramp + day_ramp * 0.1) + gaussian_noise
df = pd.DataFrame({'ds': ds, 'y': y})


In [108]:
# Initialize the Prophet model with its default arguments
model = Prophet()

# Fit the model on the 'ds'/'y' dataframe
model.fit(df)

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1


<prophet.forecaster.Prophet at 0x16180f070>

In [109]:
# Create the future dataframe (365 additional periods)
future = model.make_future_dataframe(periods=365)

# Predict over the future dataframe
forecast = model.predict(future)

In [112]:
# Print the first row of the forecast dataframe
print(forecast.head(1))

          ds     trend  yhat_lower  yhat_upper  trend_lower  trend_upper  \
0 2021-01-01  0.438672   -1.884259    3.020923     0.438672     0.438672   

   additive_terms  additive_terms_lower  additive_terms_upper    weekly  \
0        0.180405              0.180405              0.180405  0.180405   

   weekly_lower  weekly_upper  multiplicative_terms  \
0      0.180405      0.180405                   0.0   

   multiplicative_terms_lower  multiplicative_terms_upper      yhat  
0                         0.0                         0.0  0.619077  


## 7.6 Neural-Prophet

In [2]:
import pandas as pd
from neuralprophet import NeuralProphet
import numpy as np

In [3]:
# Create test data
def create_test_data(periods=365):
    """Build a simple daily series: a linear trend plus a sinusoidal seasonal term.

    The series starts on 2020-01-01. The trend is 0, 1, ..., periods - 1 and
    the seasonal term is a sine of amplitude 10 whose phase runs from 0 to
    2 * pi * periods / 365 across the series.

    Args:
        periods: Number of consecutive dates to generate.

    Returns:
        A DataFrame with the columns 'ds' (the dates) and 'y' (trend + seasonality).
    """
    timestamps = pd.date_range(start='2020-01-01', periods=periods)
    phase = np.linspace(0, 2 * np.pi * periods / 365, periods)
    linear_part = pd.Series(range(periods), index=timestamps)
    seasonal_part = pd.Series(10 * np.sin(phase), index=timestamps)
    return pd.DataFrame({'ds': timestamps, 'y': linear_part + seasonal_part})

In [4]:
# Generate the data (default of 365 periods)
df = create_test_data()

# Initialize the NeuralProphet model with its default arguments
m = NeuralProphet()


In [5]:
# Fit the model on daily-frequency data
metrics = m.fit(df, freq='D')

# Make predictions for 30 future periods
future = m.make_future_dataframe(df, periods=30)
forecast = m.predict(future)

# Print the predictions
print(forecast[['ds', 'yhat1']])  # 'yhat1' is the name of the prediction column

# metrics holds the per-epoch training metrics returned by fit (MAE, RMSE, Loss, RegLoss in the output below)
print(metrics)

INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.726% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D
INFO - (NP.config.init_data_params) - Setting normalization to global as only one dataframe provided for training.
INFO - (NP.utils.set_auto_seasonalities) - Disabling yearly seasonality. Run NeuralProphet with yearly_seasonality=True to override this.
INFO - (NP.utils.set_auto_seasonalities) - Disabling daily seasonality. Run NeuralProphet with daily_seasonality=True to override this.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 16
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 278


Finding best initial lr:   0%|          | 0/216 [00:00<?, ?it/s]

Training: 0it [00:00, ?it/s]

INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 99.726% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D
INFO - (NP.df_utils.return_df_in_original_format) - Returning df with no ID column
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 96.667% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D
INFO - (NP.df_utils._infer_frequency) - Major frequency D corresponds to 96.667% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - D


Predicting: 23it [00:00, ?it/s]

INFO - (NP.df_utils.return_df_in_original_format) - Returning df with no ID column


           ds       yhat1
0  2020-12-31  363.447693
1  2021-01-01  364.563782
2  2021-01-02  365.703064
3  2021-01-03  366.840515
4  2021-01-04  367.978058
5  2021-01-05  369.106140
6  2021-01-06  370.063385
7  2021-01-07  371.150024
8  2021-01-08  372.266205
9  2021-01-09  373.405365
10 2021-01-10  374.542847
11 2021-01-11  375.680481
12 2021-01-12  376.808441
13 2021-01-13  377.765778
14 2021-01-14  378.852356
15 2021-01-15  379.968536
16 2021-01-16  381.107788
17 2021-01-17  382.245178
18 2021-01-18  383.382782
19 2021-01-19  384.510864
20 2021-01-20  385.468109
21 2021-01-21  386.554749
22 2021-01-22  387.670837
23 2021-01-23  388.810120
24 2021-01-24  389.947571
25 2021-01-25  391.085114
26 2021-01-26  392.213196
27 2021-01-27  393.170532
28 2021-01-28  394.257080
29 2021-01-29  395.373260
            MAE        RMSE      Loss  RegLoss  epoch
0    461.119720  546.455994  0.619706      0.0      0
1    455.449982  538.055664  0.608181      0.0      1
2    447.833893  530.522949  0.5

## 7.7 Informer

Informer 可以参考论文作者的原始实现 https://github.com/zhouhaoyi/Informer2020