In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])



trainset = torchvision.datasets.MNIST(root='./data', train=True,
                                      download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=64,
                                          shuffle=True, num_workers=2)
testset = torchvision.datasets.MNIST(root='./data', train=False,
                                        download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(testset, batch_size=64,
                                          shuffle=False, num_workers=2)
# 这是一个nmist数据集
#
#trainset是训练集，testset是测试集
print(len(trainset))
print(len(testset))
#trainset和testset都是一个数据集，里面有60000个样本,被分成了两个数据集，一个是训练集，一个是测试集，确保训练集和测试集是独立的
# print(trainset)

60000
10000


In [2]:
class CVAE(nn.Module):
    def __init__(self):
        super(CVAE, self).__init__()
        self.label_emb = nn.Embedding(10, 10)  # Embedding for the 10 classes

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(794, 400),  # 784 pixels + 10 label embedding = 794
            nn.ReLU(),
        )
        self.fc_mu = nn.Linear(400, 20)  # Latent space mean
        self.fc_var = nn.Linear(400, 20)  # Latent space variance

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(30, 400),  # 20 latent dims + 10 label embedding
            nn.ReLU(),
            nn.Linear(400, 784),
            nn.Sigmoid(),  # Output between 0 and 1
        )

    def encode(self, x, labels):
        combined = torch.cat((x, self.label_emb(labels)), 1)
        h1 = self.encoder(combined)
        return self.fc_mu(h1), self.fc_var(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z, labels):
        combined = torch.cat((z, self.label_emb(labels)), 1)
        return self.decoder(combined)

    def forward(self, x, labels):
        mu, logvar = self.encode(x.view(-1, 784), labels)
        z = self.reparameterize(mu, logvar)
        return self.decode(z, labels), mu, logvar
# test the model
model = CVAE()
x = torch.randn(64, 784)
labels = torch.randint(0, 10, (64,))
output = model(x, labels)
print(output[0].shape)
print(output[1].shape)
print(output[2].shape)


print(output[0])
import torch.nn.functional as F
# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function(recon_x, x, mu, logvar):
    recon_x = torch.clamp(recon_x, 0, 1)  # Ensure the reconstructions are in the valid range for BCE
    BCE = F.binary_cross_entropy(recon_x, x.view(-1, 784), reduction='sum')

    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

optimizer = optim.Adam(model.parameters(), lr=1e-3)


torch.Size([64, 784])
torch.Size([64, 20])
torch.Size([64, 20])
tensor([[0.4154, 0.4601, 0.3943,  ..., 0.5111, 0.3938, 0.5057],
        [0.4585, 0.4871, 0.5030,  ..., 0.4416, 0.5652, 0.5018],
        [0.4122, 0.5386, 0.4323,  ..., 0.4447, 0.4600, 0.4008],
        ...,
        [0.4137, 0.5161, 0.4431,  ..., 0.4732, 0.4933, 0.4406],
        [0.4346, 0.4951, 0.4104,  ..., 0.4737, 0.4944, 0.4698],
        [0.5236, 0.5448, 0.3843,  ..., 0.5443, 0.4576, 0.4762]],
       grad_fn=<SigmoidBackward0>)


In [3]:
def train(model, device, train_loader, optimizer, loss_function):
    model.train()
    total_loss = 0
    for batch_idx, (data, labels) in enumerate(train_loader):
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        
        # Forward pass: Compute predicted y by passing x to the model
        recon_batch, mu, logvar = model(data, labels)
        
        # Compute and print loss
        loss = loss_function(recon_batch, data, mu, logvar)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    average_loss = total_loss / len(train_loader.dataset)
    return average_loss  # We return average loss to track over epochs
def test(model, device, test_loader, loss_function):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data, labels in test_loader:
            data, labels = data.to(device), labels.to(device)
            recon_batch, mu, logvar = model(data, labels)
            test_loss += loss_function(recon_batch, data, mu, logvar).item()
    
    test_loss /= len(test_loader.dataset)
    return test_loss

def train_test_loop(model, train_loader, test_loader, optimizer, loss_function, num_epochs, device):
    for epoch in range(num_epochs):
        train_loss = train(model, device, train_loader, optimizer, loss_function)
        test_loss = test(model, device, test_loader, loss_function)
        
        print(f'Epoch [{epoch+1}/{num_epochs}]: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model = CVAE().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 10
train_test_loop(model, train_loader, test_loader, optimizer, loss_function, num_epochs, device)


/opt/conda/conda-bld/pytorch_1711403463728/work/aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [51,0,0], thread: [64,0,0] Assertion `target_val >= zero && target_val <= one` failed.
/opt/conda/conda-bld/pytorch_1711403463728/work/aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [51,0,0], thread: [65,0,0] Assertion `target_val >= zero && target_val <= one` failed.
/opt/conda/conda-bld/pytorch_1711403463728/work/aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [51,0,0], thread: [66,0,0] Assertion `target_val >= zero && target_val <= one` failed.
/opt/conda/conda-bld/pytorch_1711403463728/work/aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [51,0,0], thread: [67,0,0] Assertion `target_val >= zero && target_val <= one` failed.
/opt/conda/conda-bld/pytorch_1711403463728/work/aten/src/ATen/native/cuda/Loss.cu:95: operator(): block: [51,0,0], thread: [69,0,0] Assertion `target_val >= zero && target_val <= one` failed.
/opt/conda/conda-bld/pytorch_17114034637

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [10]:
def train(model, device, train_loader, optimizer, loss_function):
    model.train()  
    total_loss = 0
    correct = 0
    total = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)  
        correct += pred.eq(target.view_as(pred)).sum().item()
        total += target.size(0)
    
    average_loss = total_loss / len(train_loader)
    accuracy = 100. * correct / total
    return average_loss, accuracy


def test(model, device, test_loader, loss_function):
    model.eval() # 将模型设置为评估模式
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = loss_function(output, target)
            test_loss += loss.item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += target.size(0)

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / total
    return test_loss, accuracy

#写一个循环来做训练和测试
# train(mlp, trainloader, criterion, optimizer)
# test(mlp, testloader, criterion）

#现在我们把训练和测试的代码封装到一个函数里面

def train_test_loop(model, train_loader, test_loader, optimizer, loss_function, num_epochs, device, save_path='model_checkpoint/', scheduler=None, checkpoint_filename=None, save_frequency=1):
    start_epoch = 0
    for epoch in range(start_epoch, num_epochs):
        train_loss, train_accuracy = train(model, device, train_loader, optimizer, loss_function)
        test_loss, test_accuracy = test(model, device, test_loader, loss_function)

        if scheduler is not None:
            scheduler.step()
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%')



In [7]:

# 定义损失函数和优化器

loss_function = nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Autoencoder().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

num_epochs=4
train_test_loop(model,train_loader, test_loader, 
                optimizer, 
                loss_function, 
                num_epochs,device,
                )

  return F.conv2d(input, weight, bias, self.stride,


RuntimeError: only batches of spatial targets supported (3D tensors) but got targets of size: : [64]

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionLayer(nn.Module):
    def __init__(self, key_size, query_size, value_size):
        """
        初始化注意力层
        :param key_size: 键向量的维度
        :param query_size: 查询向量的维度
        :param value_size: 值向量的维度
        """
        super(AttentionLayer, self).__init__()
        self.key_size = key_size
        self.query_size = query_size
        self.value_size = value_size

        # 定义权重矩阵
        self.key_layer = nn.Linear(query_size, key_size, bias=False)
        self.query_layer = nn.Linear(query_size, key_size, bias=False)
        self.value_layer = nn.Linear(value_size, value_size, bias=False)

    def forward(self, keys, queries, values, mask=None):
        """
        前向传播方法
        :param keys: 键向量
        :param queries: 查询向量
        :param values: 值向量
        :param mask: 可选的掩码向量，用于遮盖不需要关注的部分
        """
        # 计算查询、键、值
        queries = self.query_layer(queries)
        keys = self.key_layer(keys)
        values = self.value_layer(values)

        # 计算注意力分数
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / self.key_size**0.5

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
            
        # 应用softmax
        attention_weights = F.softmax(scores, dim=-1)

        # 计算加权和
        output = torch.matmul(attention_weights, values)
        return output, attention_weights

# 示例用法
key_size = 64
query_size = 64
value_size = 128
batch_size = 10
seq_length = 20

# 创建模型
attention = AttentionLayer(key_size, query_size, value_size)

# 模拟输入数据
keys = torch.rand(batch_size, seq_length, key_size)
queries = torch.rand(batch_size, seq_length, query_size)
values = torch.rand(batch_size, seq_length, value_size)

# 运行注意力层
output, attention_weights = attention(keys, queries, values)

print("Output shape:", output.shape)
print("Attention weights shape:", attention_weights.shape)

        

Output shape: torch.Size([10, 20, 128])
Attention weights shape: torch.Size([10, 20, 20])


In [None]:
class my_transformer(nn.Module):
    def __init__(self):
        super(my_transformer, self).__init__()
        
    def forward(self, x):

In [None]:

# 定义损失函数和优化器

loss_function = nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = my_transformer().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

num_epochs=4
# train_test_loop(model,train_loader, test_loader, 
#                 optimizer, 
#                 loss_function, 
#                 num_epochs,device,
#                 )