# Ulysses 序列并行示例

Author: kaiyuan

Email: kyxie@zju.edu.cn

**说明**

仅关注前向运算。先定义一个标准 Attention 运算作为参照，再定义一个基于 Ulysses 原理的运算过程。关键点：
* 用 for 循环来模拟多个 GPU 上的运算；
* 定义两个函数，分别模拟 Attention 计算前、后的 All-to-All 通信过程；
* 为了方便结果比较，序列并行只体现在 Attention 运算内部，最后将各 GPU 的计算结果拼接（cat）到一起。

In [1]:
import torch
import torch.nn as nn

class StandardAttention(nn.Module):
    """Approach 1: plain multi-head self-attention, used as the reference.

    Works on an unbatched input of shape ``[seq_len, hidden_dim]`` and prints
    the tensor shape after every step so the data flow can be followed.

    Args:
        hidden_dim: Width of the input, of the Q/K/V projections and of the
            output projection.
        num_heads: Number of attention heads; must divide ``hidden_dim``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``hidden_dim``.
    """

    def __init__(self, hidden_dim: int = 8, num_heads: int = 2) -> None:
        super().__init__()
        # Fail early with a clear message: otherwise the floor division below
        # silently truncates and the error only surfaces later inside view().
        if num_heads < 1:
            raise ValueError(f"num_heads must be positive, got {num_heads}")
        if hidden_dim % num_heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )

        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads

        # Creation order is kept (q, k, v, out) so that seeded runs draw the
        # same initial weights as before.
        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, hidden_dim)
        self.v_proj = nn.Linear(hidden_dim, hidden_dim)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply multi-head self-attention to ``x``.

        Args:
            x: Input tensor of shape ``[seq_len, hidden_dim]``.

        Returns:
            Tensor of shape ``[seq_len, hidden_dim]``.
        """
        seq_len = x.shape[0]

        print("=" * 60)
        print("方式1：标准多头注意力")
        print("=" * 60)
        print(f"1. 输入 shape: {x.shape}")

        # Q/K/V projections
        Q = self.q_proj(x)
        K = self.k_proj(x)
        V = self.v_proj(x)
        print(f"2. Q/K/V投影 shape: {Q.shape}")

        # Split the hidden dimension into heads
        head_shape = (seq_len, self.num_heads, self.head_dim)
        Q = Q.view(head_shape)
        K = K.view(head_shape)
        V = V.view(head_shape)
        print(f"3. 重塑多头 shape: {Q.shape}")

        # Put the head axis first so matmul treats it as a batch axis
        Q = Q.transpose(0, 1)  # [num_heads, seq_len, head_dim]
        K = K.transpose(0, 1)
        V = V.transpose(0, 1)
        print(f"4. 转置后 shape: {Q.shape}")

        # Scaled dot-product attention with softmax over the key axis
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_scores = torch.softmax(attn_scores, dim=-1)
        attn_output = torch.matmul(attn_scores, V)
        print(f"5. 注意力计算后 shape: {attn_output.shape}")

        # Back to sequence-major layout
        attn_output = attn_output.transpose(0, 1)  # [seq_len, num_heads, head_dim]
        print(f"6. 转置回来 shape: {attn_output.shape}")

        # Merge the heads; reshape (not view) because the transposed tensor
        # is no longer contiguous
        attn_output = attn_output.reshape(seq_len, self.hidden_dim)
        print(f"7. 重塑回原始 shape: {attn_output.shape}")

        # Output projection
        output = self.out_proj(attn_output)
        print(f"8. 最终输出 shape: {output.shape}")

        return output

In [2]:
class UlyssesParallelAttention(nn.Module):
    """Approach 2: Ulysses-style sequence-parallel attention, simulated on one device.

    The "GPUs" are simulated with Python loops. The input sequence is split
    across ``num_gpus`` shards; an all-to-all exchange regroups the data so
    every shard holds the full sequence for a subset of heads, attention is
    computed per shard, and a second all-to-all restores the sequence-sharded
    layout before the output projection.

    Args:
        hidden_dim: Width of the input and of all projections.
        num_heads: Number of attention heads; must divide ``hidden_dim`` and
            be divisible by ``num_gpus``.
        num_gpus: Number of simulated devices.

    Raises:
        ValueError: If ``num_heads`` or ``num_gpus`` is not positive, if
            ``hidden_dim`` is not divisible by ``num_heads``, or if
            ``num_heads`` is not divisible by ``num_gpus``.
    """

    def __init__(self, hidden_dim: int = 8, num_heads: int = 2, num_gpus: int = 2) -> None:
        super().__init__()
        # Validate up front: the floor divisions below would otherwise
        # silently drop heads / features and fail later with a shape error.
        if num_heads < 1:
            raise ValueError(f"num_heads must be positive, got {num_heads}")
        if num_gpus < 1:
            raise ValueError(f"num_gpus must be positive, got {num_gpus}")
        if hidden_dim % num_heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        if num_heads % num_gpus != 0:
            raise ValueError(
                f"num_heads ({num_heads}) must be divisible by "
                f"num_gpus ({num_gpus})"
            )

        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.num_gpus = num_gpus
        self.head_dim = hidden_dim // num_heads
        self.local_num_heads = num_heads // num_gpus

        # Creation order is kept (q, k, v, out) so that seeded runs draw the
        # same initial weights as before.
        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, hidden_dim)
        self.v_proj = nn.Linear(hidden_dim, hidden_dim)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim)

    def all_to_all_head_to_sequence(self, data_list):
        """Simulate the all-to-all that trades head sharding for full sequence.

        Args:
            data_list: One tensor per simulated GPU, each of shape
                ``[local_seq_len, num_heads, head_dim]``.

        Returns:
            A list with one tensor per simulated GPU, each of shape
            ``[seq_len, local_num_heads, head_dim]``.
        """
        heads = self.local_num_heads
        # GPU `rank` takes its own slice of heads from every shard and
        # concatenates the pieces along the sequence axis.
        return [
            torch.cat(
                [shard[:, rank * heads:(rank + 1) * heads, :] for shard in data_list],
                dim=0,
            )
            for rank in range(len(data_list))
        ]

    def all_to_all_sequence_to_head(self, data_list):
        """Simulate the all-to-all that trades full sequence for all heads.

        Args:
            data_list: One tensor per simulated GPU, each of shape
                ``[seq_len, local_num_heads, head_dim]``.

        Returns:
            A list with one tensor per simulated GPU, each of shape
            ``[local_seq_len, num_heads, head_dim]``.
        """
        num_gpus = len(data_list)
        local_seq_len = data_list[0].shape[0] // num_gpus
        # GPU `rank` takes its own slice of the sequence from every shard and
        # concatenates the pieces along the head axis.
        return [
            torch.cat(
                [
                    shard[rank * local_seq_len:(rank + 1) * local_seq_len, :, :]
                    for shard in data_list
                ],
                dim=1,
            )
            for rank in range(num_gpus)
        ]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the simulated sequence-parallel attention on ``x``.

        Args:
            x: Input tensor of shape ``[seq_len, hidden_dim]``; ``seq_len``
                must be divisible by ``num_gpus``.

        Returns:
            Tensor of shape ``[seq_len, hidden_dim]``: the per-GPU outputs
            concatenated along the sequence axis.

        Raises:
            ValueError: If ``seq_len`` is not divisible by ``num_gpus``.
        """
        seq_len = x.shape[0]
        if seq_len % self.num_gpus != 0:
            # torch.chunk would otherwise hand out uneven (or too few) shards.
            raise ValueError(
                f"seq_len ({seq_len}) must be divisible by "
                f"num_gpus ({self.num_gpus})"
            )
        local_seq_len = seq_len // self.num_gpus

        print("\n" + "=" * 60)
        print("方式2：Ulysses序列并行")
        print("=" * 60)
        print(f"参数: seq_len={seq_len}, 每个GPU local_seq_len={local_seq_len}")
        print(f"      num_heads={self.num_heads}, 每个GPU local_num_heads={self.local_num_heads}")
        print()
        print(f"1. 完整输入 shape: {x.shape}")

        # 1. Shard the sequence across the simulated GPUs
        x_split = torch.chunk(x, self.num_gpus, dim=0)
        print(f"2. 序列分割到{self.num_gpus}个GPU:")
        for i in range(self.num_gpus):
            print(f"   GPU{i} 输入 shape: {x_split[i].shape}")

        # 2. Every GPU projects its local tokens to Q/K/V
        print(f"\n3. 每个GPU计算本地Q/K/V:")
        q_list, k_list, v_list = [], [], []
        head_shape = (local_seq_len, self.num_heads, self.head_dim)

        for i, x_local in enumerate(x_split):
            # Local projections
            Q_local = self.q_proj(x_local)
            K_local = self.k_proj(x_local)
            V_local = self.v_proj(x_local)
            print(f"   GPU{i}: Q/K/V投影 shape: {Q_local.shape}")

            # Split the hidden dimension into heads
            Q_local = Q_local.view(head_shape)
            K_local = K_local.view(head_shape)
            V_local = V_local.view(head_shape)
            print(f"   GPU{i}: 重塑多头 shape: {Q_local.shape}")

            q_list.append(Q_local)
            k_list.append(K_local)
            v_list.append(V_local)

        # 3. First all-to-all: sequence-sharded -> head-sharded
        print(f"\n4. 第一次All-to-All通信:")
        print(f"   前: 每个GPU shape {q_list[0].shape}")

        Q_exchanged = self.all_to_all_head_to_sequence(q_list)
        K_exchanged = self.all_to_all_head_to_sequence(k_list)
        V_exchanged = self.all_to_all_head_to_sequence(v_list)

        print(f"   后: 每个GPU shape {Q_exchanged[0].shape}")

        # 4. Every GPU runs attention over the full sequence for its heads
        print(f"\n5. 每个GPU计算注意力:")
        attn_outputs = []

        for i in range(self.num_gpus):
            # Put the head axis first so matmul treats it as a batch axis
            Q_ex = Q_exchanged[i].transpose(0, 1)  # [local_num_heads, seq_len, head_dim]
            K_ex = K_exchanged[i].transpose(0, 1)
            V_ex = V_exchanged[i].transpose(0, 1)
            print(f"   GPU{i}: 转置后 shape: {Q_ex.shape}")

            # Scaled dot-product attention with softmax over the key axis
            attn_scores = torch.matmul(Q_ex, K_ex.transpose(-2, -1)) / (self.head_dim ** 0.5)
            attn_scores = torch.softmax(attn_scores, dim=-1)
            attn_output = torch.matmul(attn_scores, V_ex)
            print(f"   GPU{i}: 注意力计算后 shape: {attn_output.shape}")

            # Back to sequence-major layout
            attn_output = attn_output.transpose(0, 1)  # [seq_len, local_num_heads, head_dim]
            print(f"   GPU{i}: 转置回来 shape: {attn_output.shape}")

            attn_outputs.append(attn_output)

        # 5. Second all-to-all: head-sharded -> sequence-sharded
        print(f"\n6. 第二次All-to-All通信:")
        print(f"   前: 每个GPU shape {attn_outputs[0].shape}")

        attn_exchanged = self.all_to_all_sequence_to_head(attn_outputs)

        print(f"   后: 每个GPU shape {attn_exchanged[0].shape}")

        # 6. Merge heads and apply the output projection on every GPU
        print(f"\n7. 每个GPU重塑并投影:")
        final_outputs = []

        for i, attn_local in enumerate(attn_exchanged):
            # Merge heads: [local_seq_len, hidden_dim]
            attn_reshaped = attn_local.reshape(local_seq_len, self.hidden_dim)
            print(f"   GPU{i}: 重塑后 shape: {attn_reshaped.shape}")

            # Output projection
            output_local = self.out_proj(attn_reshaped)
            print(f"   GPU{i}: 投影后 shape: {output_local.shape}")

            final_outputs.append(output_local)

        # 7. Gather the per-GPU outputs along the sequence axis
        full_output = torch.cat(final_outputs, dim=0)
        print(f"\n8. 收集所有GPU输出 shape: {full_output.shape}")

        return full_output


In [3]:
def compare_with_same_weights():
    """Run both attention variants with identical weights and input and compare them.

    Builds a ``StandardAttention`` and a ``UlyssesParallelAttention`` under a
    fixed seed, copies the projection weights of the former into the latter,
    feeds both the same random input and prints the maximum absolute
    difference and the relative error between the two outputs.

    Returns:
        The tuple ``(output1, output2)``: the standard output and the
        sequence-parallel output.
    """
    banner = "=" * 60
    print(banner)
    print("Ulysses序列并行结果对比")
    print(banner)

    # Fixed seed so the weights and the input are reproducible
    torch.manual_seed(3)

    # Problem size
    seq_len, hidden_dim, num_heads, num_gpus = 2, 6, 2, 2

    # Build the two models (standard first, then parallel)
    standard_model = StandardAttention(hidden_dim, num_heads)
    parallel_model = UlyssesParallelAttention(hidden_dim, num_heads, num_gpus)

    # Overwrite the parallel model's projections with the standard model's
    with torch.no_grad():
        for layer_name in ("q_proj", "k_proj", "v_proj", "out_proj"):
            source_layer = getattr(standard_model, layer_name)
            target_layer = getattr(parallel_model, layer_name)
            target_layer.weight.copy_(source_layer.weight)
            target_layer.bias.copy_(source_layer.bias)

    # Shared input
    x = torch.randn(seq_len, hidden_dim)
    print(f"输入数据 shape: {x.shape}")
    print(f"输入数据 (前3行):\n{x[:3]}")

    print("\n" + banner)

    with torch.no_grad():
        # Approach 1: standard attention
        output1 = standard_model(x.clone())

        print("\n" + banner)

        # Approach 2: Ulysses sequence parallelism
        output2 = parallel_model(x.clone())

    print("\n" + banner)
    print("结果对比")
    print(banner)

    # Output shapes
    print(f"方式1输出 shape: {output1.shape}")
    print(f"方式2输出 shape: {output2.shape}")

    # Largest element-wise deviation
    delta = output1 - output2
    diff = delta.abs().max().item()
    print(f"\n两种方式输出最大差异: {diff:.6f}")

    # Relative error measured with the default tensor norm
    rel_error = torch.norm(delta) / torch.norm(output1)
    print(f"相对误差: {rel_error:.6f}")

    # Verdict
    print("✓ 两种方式计算结果一致！" if diff < 1e-5 else "⚠ 两种方式计算结果存在差异")

    # Show the leading rows of both outputs side by side
    for label, tensor in (("方式1", output1), ("方式2", output2)):
        print(f"\n{label}输出 (前3行):")
        print(tensor[:3])

    return output1, output2

In [4]:
# Run the comparison. The call returns the (output1, output2) tuple, which
# the notebook echoes as the cell result after the printed log.
compare_with_same_weights()

Ulysses序列并行结果对比
输入数据 shape: torch.Size([2, 6])
输入数据 (前3行):
tensor([[-0.0991,  1.0835, -0.4738,  0.9102,  0.8597, -0.0252],
        [ 0.3625, -1.5558,  0.5427, -0.0135,  1.1126,  1.2475]])

方式1：标准多头注意力
1. 输入 shape: torch.Size([2, 6])
2. Q/K/V投影 shape: torch.Size([2, 6])
3. 重塑多头 shape: torch.Size([2, 2, 3])
4. 转置后 shape: torch.Size([2, 2, 3])
5. 注意力计算后 shape: torch.Size([2, 2, 3])
6. 转置回来 shape: torch.Size([2, 2, 3])
7. 重塑回原始 shape: torch.Size([2, 6])
8. 最终输出 shape: torch.Size([2, 6])


方式2：Ulysses序列并行
参数: seq_len=2, 每个GPU local_seq_len=1
      num_heads=2, 每个GPU local_num_heads=1

1. 完整输入 shape: torch.Size([2, 6])
2. 序列分割到2个GPU:
   GPU0 输入 shape: torch.Size([1, 6])
   GPU1 输入 shape: torch.Size([1, 6])

3. 每个GPU计算本地Q/K/V:
   GPU0: Q/K/V投影 shape: torch.Size([1, 6])
   GPU0: 重塑多头 shape: torch.Size([1, 2, 3])
   GPU1: Q/K/V投影 shape: torch.Size([1, 6])
   GPU1: 重塑多头 shape: torch.Size([1, 2, 3])

4. 第一次All-to-All通信:
   前: 每个GPU shape torch.Size([1, 2, 3])
   后: 每个GPU shape torch.Size([2, 1, 3])

(tensor([[-0.1666,  0.1110, -0.0746, -0.2954, -0.2557,  0.0878],
         [-0.1656,  0.1140, -0.0928, -0.3176, -0.2792,  0.1062]]),
 tensor([[-0.1666,  0.1110, -0.0746, -0.2954, -0.2557,  0.0878],
         [-0.1656,  0.1140, -0.0928, -0.3176, -0.2792,  0.1062]]))