# MLP

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features per sample after flattening.
        hidden_size: Number of units in the hidden layer.
        num_classes: Number of output units.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Kept on the instance because forward() flattens its input to this width.
        self.input_size = input_size
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Flatten ``x`` to ``(-1, input_size)`` and return the raw layer outputs.

        No activation is applied after the second linear layer, so the result
        has shape ``(N, num_classes)`` where ``N`` is inferred by ``view``.
        """
        x = x.view(-1, self.input_size)
        # Equivalent two-step spelling: x = self.fc1(x); x = torch.relu(x)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)
    
# 1. Create an MLP instance together with its loss function and optimizer.
input_size = 576  # features per sample (e.g. a flattened 24x24-pixel image)
hidden_size = 512  # number of units in the hidden layer
num_classes = 10  # number of output classes
num_epochs = 100  # number of training epochs (used by the loop and the log line)
mlp = MLP(input_size, hidden_size, num_classes)
criterion = nn.MSELoss()
optimizer = optim.SGD(mlp.parameters(), lr=1e-4)

# 2. Random stand-in data: 64 samples with random regression-style targets.
x_train = torch.randn(64, input_size)
y_train = torch.randn(64, num_classes)

# 3. Train for num_epochs epochs on the single fixed batch.
for epoch in range(num_epochs):
    # Forward pass
    outputs = mlp(x_train)
    loss = criterion(outputs, y_train)

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 4. Inference on one random sample; gradients are not needed here.
with torch.no_grad():
    sample_data = torch.randn(1, input_size)
    predictions = mlp(sample_data)
    print(predictions)
        
        


        

# Self-Attention

In [None]:
import torch
import torch.nn as nn
import math

class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        hidden_dim: Size of the last dimension of the input; the query, key
            and value projections all map ``hidden_dim -> hidden_dim``.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, hidden_dim)
        self.v_proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        """Return ``softmax(Q K^T / sqrt(hidden_dim)) V`` for input ``x``.

        The last dimension of ``x`` must equal ``hidden_dim``; the
        second-to-last dimension is the one attended over.
        """
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)
        # Pairwise query/key similarity, scaled before the softmax.
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.hidden_dim)
        attn_weight = torch.softmax(scores, dim=-1)
        return torch.matmul(attn_weight, v)

# MHA

In [None]:
import math

import torch
import torch.nn as nn

class MHA(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        hidden_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must divide ``hidden_dim``.
        attention_dropout: Dropout probability applied to the attention weights.
        output_dropout: Dropout probability applied after the output projection.

    Raises:
        ValueError: If ``hidden_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, hidden_dim, num_heads, attention_dropout=0.1, output_dropout=0.1):
        super().__init__()
        if hidden_dim % num_heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        # Integer division: view() only accepts integer sizes.
        self.head_dim = hidden_dim // num_heads
        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, hidden_dim)
        self.v_proj = nn.Linear(hidden_dim, hidden_dim)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim)
        self.attn_dropout = nn.Dropout(attention_dropout)
        self.output_dropout = nn.Dropout(output_dropout)

    def _split_heads(self, t, batch_size, seq_len):
        """Reshape (batch_size, seq_len, hidden_dim) to (batch_size, num_heads, seq_len, head_dim)."""
        return t.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """Apply multi-head self-attention to a 3-D input.

        ``x`` is unpacked as ``(batch_size, seq_len, hidden_dim)``; the result
        has the same shape.
        """
        batch_size, seq_len, _ = x.shape
        # Each projection becomes (batch_size, num_heads, seq_len, head_dim).
        Q = self._split_heads(self.q_proj(x), batch_size, seq_len)
        K = self._split_heads(self.k_proj(x), batch_size, seq_len)
        V = self._split_heads(self.v_proj(x), batch_size, seq_len)

        scores = Q @ K.transpose(-2, -1) / math.sqrt(self.head_dim)
        attn_weight = torch.softmax(scores, dim=-1)
        attn_weight = self.attn_dropout(attn_weight)
        output = attn_weight @ V
        # transpose() leaves the tensor non-contiguous; contiguous() copies it into
        # contiguous memory, which view() requires before merging the heads back.
        output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.hidden_dim)
        output = self.out_proj(output)
        output = self.output_dropout(output)

        return output
        
        
        
        