In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from apex import amp
from torch.utils.data import DataLoader, TensorDataset
import time

In [2]:
# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'使用设备: {device}')

使用设备: cuda


In [3]:
# A minimal model consisting of a single linear layer
class SimpleModel(nn.Module):
    """Single fully connected layer.

    Args:
        input_size: Number of features in each input row.
        output_size: Number of values produced for each input row.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # The attribute name `linear` determines the state_dict keys
        # (`linear.weight`, `linear.bias`).
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Return the result of applying the linear layer to ``x``."""
        prediction = self.linear(x)
        return prediction

In [4]:
# Build the model, the loss function and the optimizer.
#
# Seed torch's random number generators first. Without a seed the weight
# initialisation below (and every later torch.randn / shuffle that continues
# from this RNG state on a Restart & Run All) differs on each run, so the
# printed losses cannot be reproduced.
SEED = 0
torch.manual_seed(SEED)

input_size = 100    # number of input features (raised to 100)
output_size = 1     # number of output features (unchanged)
model = SimpleModel(input_size=input_size, output_size=output_size).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)  # tuned learning rate

In [5]:
# Hand the model and optimizer to APEX amp, requesting opt level O3
# (described by the original author as pure fp16).
# NOTE(review): the output saved with this cell reports
# "Selected optimization level O1", not O3 -- it appears to come from an
# earlier run. Restart the kernel and re-run so code and output agree.
model, optimizer = amp.initialize(
    model,
    optimizer,
    opt_level='O3',
)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


  self._overflow_buf = torch.cuda.IntTensor([0])


In [6]:
# Synthetic data: standard-normal noise for both the inputs and the targets
num_samples = 100_000  # number of samples
X = torch.randn(num_samples, input_size).to(device).to(torch.float16)  # inputs are fp16
y = torch.randn(num_samples, output_size).to(device)                   # targets are fp32

# Wrap the tensors in a dataset and serve them as shuffled mini-batches
batch_size = 1024  # moderately large batch size
dataset = TensorDataset(X, y)
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [7]:
# Train for a fixed number of epochs and report the elapsed time.
#
# CUDA work is queued asynchronously, so drain the GPU queue before reading
# the clock; otherwise work still pending from earlier cells would be counted.
if device.type == 'cuda':
    torch.cuda.synchronize()
# perf_counter() is monotonic and high-resolution. time.time() follows the
# system clock, which can be adjusted mid-run and corrupt the measured duration.
start_time = time.perf_counter()

# Training loop
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()

        # Move the batch to the training device.
        inputs = inputs.to(device)
        targets = targets.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backpropagate through amp's scaled loss; scaling is meant to
        # protect small gradients from underflowing in reduced precision.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        optimizer.step()

        # Accumulate the unscaled loss as a Python float.
        epoch_loss += loss.item()

    # Mean loss over the batches of this epoch
    avg_epoch_loss = epoch_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_epoch_loss:.4f}')

# Wait for any outstanding GPU work so it is included in the measurement,
# then compute and print the total training time.
if device.type == 'cuda':
    torch.cuda.synchronize()
end_time = time.perf_counter()
training_time = end_time - start_time
print(f"Training time: {training_time:.2f} seconds")

Epoch [1/10], Loss: 1.1212
Epoch [2/10], Loss: 1.0003
Epoch [3/10], Loss: 0.9969
Epoch [4/10], Loss: 0.9971
Epoch [5/10], Loss: 0.9970
Epoch [6/10], Loss: 0.9971
Epoch [7/10], Loss: 0.9973
Epoch [8/10], Loss: 0.9973
Epoch [9/10], Loss: 0.9977
Epoch [10/10], Loss: 0.9977
Training time: 13.90 seconds
