## Free test

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
# 假设你希望使用 'SimHei'
plt.rcParams['font.sans-serif'] = ['SimHei', 'WenQuanYi Zen Hei', 'Arial Unicode MS'] # 添加多个备用字体
plt.rcParams['axes.unicode_minus'] = False # 解决负号 '-' 显示为方块的问题
plt.rcParams['font.size'] = 12 # 可以适当调整字体大小

In [None]:
def compute_relevance_scores(query_embeddings, document_embeddings):
    """
    计算给定查询的前 k 个最相关文档。

    参数:
    query_embeddings: 表示查询嵌入的张量 (tensor),形状: [seq_len, d_model]
    document_embeddings: 表示 k 个文档嵌入的张量,形状: [k, max_sql_len, d_model]

    返回: 基于相关性分数排序的文档索引
    """

    # 注: 假设 document_embeddings 已经进行了适当的填充并转移到 GPU

    # 1. 计算查询嵌入和文档嵌入的批量点积
    # scores = torch.matmul(query_embeddings.unsqueeze(0), document_embeddings.transpose(1, 2))
    scores = torch.einsum("sd, kmd -> ksm", query_embeddings, document_embeddings)
    print("scores shape:", scores.shape)  # 输出形状应为 [k, seq_len, max_sql_len]

    # 2. 在文档词语维度上应用最大池化,找出每个查询词语的最大相似度
    max_scores_per_query_term = scores.max(dim=2).values
    print(
        "max_scores_per_query_term shape:", max_scores_per_query_term.shape
    )  # 输出形状应为 [k, seq_len]

    # 3. 对查询词语的分数求和,得到每个文档的总分
    total_scores = max_scores_per_query_term.sum(dim=1)
    print("total_scores shape:", total_scores.shape)  # 输出形状应为 [k]

    # 4. 根据总分对文档进行降序排序
    sorted_indices = total_scores.argsort(descending=True)

    return sorted_indices


# 测试代码
if __name__ == "__main__":
    # 设置随机种子以保证结果可复现
    torch.manual_seed(42)

    # 模拟输入数据
    num_queries = 5  # 查询中的词数
    embedding_dim = 768  # 假设使用BERT-base的嵌入维度
    num_documents = 3  # 测试用文档数量
    max_doc_length = 10  # 文档的最大长度

    # 随机生成查询和文档嵌入
    query_embeddings = torch.randn(num_queries, embedding_dim)
    document_embeddings = torch.randn(num_documents, max_doc_length, embedding_dim)

    # 计算相关性分数
    relevance_scores = compute_relevance_scores(query_embeddings, document_embeddings)

    print("相关性分数:", relevance_scores)

    # 根据相关性分数对文档进行排序
    sorted_indices = relevance_scores.argsort(descending=True)
    print("按相关性分数降序排列的文档索引:", sorted_indices)

In [None]:
# --- 1. 创建一个示例的“高秩”矩阵 ---
# 我们可以创建一个 M x N 的矩阵
M, N = 100, 80
# 为了演示，我们先创建一个真实的低秩矩阵，然后加上一些噪声，使其变成一个“高秩”但可近似的矩阵
# 创建两个“瘦”矩阵，它们的乘积将是低秩的
rank_k = 5 # 假设真实秩为 5
U_true = np.random.rand(M, rank_k)
V_true = np.random.rand(N, rank_k)

# 真实的低秩矩阵
original_low_rank_matrix = np.dot(U_true, V_true.T)

# 加上一些随机噪声，使其变为一个“高秩”但有内在低秩结构的数据矩阵
noise = np.random.randn(M, N) * 0.1 # 噪音强度
A_original = original_low_rank_matrix + noise

print(f"原始矩阵 A_original 形状: {A_original.shape}")
print(f"原始矩阵 A_original 的秩 (理论上，实际计算可能因噪声而接近满秩): {np.linalg.matrix_rank(A_original)}")
print("-" * 30)

# --- 2. 使用奇异值分解 (SVD) 进行低秩分解 ---
# SVD 将矩阵 A 分解为 U * S * Vh
# U: 左奇异向量矩阵 (M x M)
# s: 奇异值向量 (min(M, N) 长度)
# Vh: 右奇异向量的共轭转置矩阵 (N x N)
U, s, Vh = np.linalg.svd(A_original)

print(f"U 矩阵形状: {U.shape}")
print(f"奇异值向量 s 的长度: {s.shape}")
print(f"Vh 矩阵形状: {Vh.shape}")
print("\n前10个奇异值 (SVD能够揭示矩阵的“能量”或重要性):")
print(s[:10])
print("-" * 30)

# --- 3. 通过截断 SVD 实现低秩近似 ---
# 我们选择一个比原始矩阵秩小得多的 k，来近似原矩阵
approx_rank = 5 # 我们希望用秩为5的矩阵来近似

if approx_rank > len(s):
    approx_rank = len(s) # 确保不超过奇异值数量

# 提取前 k 个奇异值、左奇异向量和右奇异向量
U_k = U[:, :approx_rank]
s_k = np.diag(s[:approx_rank]) # 将奇异值向量转换为对角矩阵
Vh_k = Vh[:approx_rank, :]

# 重构低秩近似矩阵 A_approx
# A_approx = U_k * s_k * Vh_k
A_approx = np.dot(U_k, np.dot(s_k, Vh_k))

print(f"低秩近似矩阵 A_approx 的形状: {A_approx.shape}")
print(f"低秩近似矩阵 A_approx 的秩: {np.linalg.matrix_rank(A_approx)}") # 理论上是 approx_rank
print("-" * 30)

# --- 4. 比较原始矩阵和低秩近似矩阵 ---
# 计算近似误差（Frobenius范数）
approximation_error = np.linalg.norm(A_original - A_approx, 'fro')
print(f"原始矩阵和低秩近似矩阵之间的误差 (Frobenius范数): {approximation_error}")

# 可视化前几个奇异值，可以看到它们衰减得很快，这正是低秩矩阵的特征
plt.figure(figsize=(10, 5))
plt.plot(s, 'o-')
plt.title('Singular Values (奇异值)')
plt.xlabel('Index')
plt.ylabel('Value')
plt.grid(True)
plt.axvline(x=approx_rank - 1, color='r', linestyle='--', label=f'Approximation Rank = {approx_rank}')
plt.legend()
plt.yscale('log') # 奇异值通常呈指数衰减，用对数坐标更清晰
plt.show()

# 进一步可视化，展示原始矩阵和近似矩阵的差异（如果矩阵足够小，可以显示图像）
# 这里只显示一个小块，因为整个矩阵太大
if M * N <= 2500: # 如果矩阵足够小，才尝试显示为图像
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.imshow(A_original, cmap='gray', aspect='auto')
    plt.title('Original Matrix')
    plt.colorbar()

    plt.subplot(1, 2, 2)
    plt.imshow(A_approx, cmap='gray', aspect='auto')
    plt.title(f'Low-Rank Approximation (rank={approx_rank})')
    plt.colorbar()
    plt.show()
else:
    print("\n矩阵太大，不适合直接可视化为图像，请检查数值。")

print("\n--- 关键点总结 ---")
print(f"原始矩阵存储参数量: {M * N}")
print(f"低秩近似矩阵存储参数量 (U_k, s_k, Vh_k): {M * approx_rank + approx_rank + N * approx_rank} (不包括额外的对角矩阵存储，如果只存向量的话)")
print(f"当 rank_k ({approx_rank}) 远小于 M({M}) 和 N({N}) 时，参数量大大减少，实现了压缩。")
print("例如，如果只存储 U_k 和 Vh_k，参数量为 (M+N)*approx_rank。")
print(f"本例中：原始参数 {M*N} = {M*N}，近似参数 {(M+N)*approx_rank} = {(M+N)*approx_rank}")

## LLM code practice

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
# 假设你希望使用 'SimHei'
plt.rcParams['font.sans-serif'] = ['SimHei', 'WenQuanYi Zen Hei', 'Arial Unicode MS'] # 添加多个备用字体
plt.rcParams['axes.unicode_minus'] = False # 解决负号 '-' 显示为方块的问题
plt.rcParams['font.size'] = 12 # 可以适当调整字体大小

In [6]:
inputs = torch.tensor(
    [
        [0.43, 0.15, 0.89],
        [0.55, 0.87, 0.66],
        [0.57, 0.85, 0.64],
        [0.22, 0.58, 0.33],
        [0.77, 0.25, 0.10],
        [0.05, 0.80, 0.55],
    ]
)
print("batch size 1, seq length 6, dim 3")
inputs.shape


batch size 1, seq length 6, dim 3


torch.Size([6, 3])

In [48]:
d_model = inputs[1].shape[0]

w_q = torch.nn.Parameter(torch.rand(d_model, d_model), requires_grad=False)
w_k = torch.nn.Parameter(torch.rand(d_model, d_model), requires_grad=False)
w_v = torch.nn.Parameter(torch.rand(d_model, d_model), requires_grad=False)

x_2 = inputs[1]
query_2 = x_2 @ w_q
key_2 = x_2 @ w_k
value_2 = x_2 @ w_v

query_2

attn_scores = inputs @ inputs.T
attn_weights = torch.softmax(attn_scores, dim=1)
attn_weights @ inputs

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])

In [42]:
class SelfAttention_v2(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_q = nn.Linear(d_in, d_out, bias=False)
        self.W_k = nn.Linear(d_in, d_out, bias=False)
        self.W_v = nn.Linear(d_in, d_out, bias=False)

    def forward(self, x):
        keys = self.W_k(x)
        values = self.W_v(x)
        queries = self.W_q(x)

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[0]**0.5, dim=-1)

        context_vec = attn_weights @ values
        return context_vec

In [47]:
sa_v1 = SelfAttention_v2(3, 3)
print(sa_v1(inputs))

tensor([[0.2356, 0.5940, 0.2045],
        [0.2364, 0.5991, 0.2040],
        [0.2363, 0.5992, 0.2041],
        [0.2371, 0.6000, 0.2047],
        [0.2355, 0.6001, 0.2049],
        [0.2376, 0.5995, 0.2044]], grad_fn=<MmBackward0>)


In [23]:
seqlen = 6
torch.tril(torch.ones(seqlen, seqlen))

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])

In [51]:
context_len = attn_weights.shape[0]
mask_simple = torch.tril(torch.ones(context_len, context_len))
attn_weights * mask_simple

tensor([[0.2098, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1385, 0.2379, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1390, 0.2369, 0.2326, 0.0000, 0.0000, 0.0000],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.0000, 0.0000],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.0000],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])

In [31]:
row_sum = mask_simple.sum(dim=1, keepdim=True)
mask_simple_norm = mask_simple / row_sum
mask_simple_norm

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667]])

In [58]:
mask = torch.triu(torch.ones(context_len, context_len), diagonal=1)
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
masked

tensor([[0.9995,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.9544, 1.4950,   -inf,   -inf,   -inf,   -inf],
        [0.9422, 1.4754, 1.4570,   -inf,   -inf,   -inf],
        [0.4753, 0.8434, 0.8296, 0.4937,   -inf,   -inf],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654,   -inf],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [None]:
attn_weights = torch.softmax(masked / d_model**0.5, dim=-1)
attn_weights

In [None]:
dropout = nn.Dropout(0.5)
dropout(attn_weights)

In [72]:
batch = torch.stack((inputs, inputs), dim=0)
batch.shape

torch.Size([2, 6, 3])

implementing compact causal self-attention class

In [77]:
class CausalAttention(nn.Module):
    def __init__(self, d_model, seq_len, dropout, qkv_bias=False):
        super().__init__()
        self.W_q = nn.Linear(d_model, d_model, bias=qkv_bias)
        self.W_k = nn.Linear(d_model, d_model, bias=qkv_bias)
        self.W_v = nn.Linear(d_model, d_model, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) # New

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.W_k(x)
        queries = self.W_q(x)
        values = self.W_v(x)

        attn_scores = queries @ keys.transpose(1, 2)
        mask = torch.triu(torch.ones(num_tokens, num_tokens), diagonal=1)
        attn_scores = attn_scores.masked_fill_(
            mask.bool()[:num_tokens, :num_tokens], -torch.inf
        )
        attn_weights = torch.softmax(attn_scores / d_in**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = attn_weights @ values
        return context_vec
    
context_length = batch.shape[1]
ca = CausalAttention(3, context_length, 0.0)

context_vecs = ca(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[ 0.0457, -0.1734, -0.2304],
         [ 0.2276, -0.3493, -0.3021],
         [ 0.2899, -0.4045, -0.3238],
         [ 0.2786, -0.3888, -0.2985],
         [ 0.2945, -0.3402, -0.2637],
         [ 0.2819, -0.3639, -0.2719]],

        [[ 0.0457, -0.1734, -0.2304],
         [ 0.2276, -0.3493, -0.3021],
         [ 0.2899, -0.4045, -0.3238],
         [ 0.2786, -0.3888, -0.2985],
         [ 0.2945, -0.3402, -0.2637],
         [ 0.2819, -0.3639, -0.2719]]], grad_fn=<UnsafeViewBackward0>)
context_vecs.shape: torch.Size([2, 6, 3])


multi-head attention

In [78]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = (
            d_out // num_heads
        )  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask", torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)
        
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # combine heads, where self.d_out = num_heads * head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)

        return context_vec
    
batch_size, context_length, d_in = batch.shape
d_out = 2
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)

context_vecs = mha(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[-0.2026,  0.0589],
         [-0.2335,  0.1066],
         [-0.2453,  0.1239],
         [-0.2445,  0.1052],
         [-0.2623,  0.1193],
         [-0.2528,  0.1023]],

        [[-0.2026,  0.0589],
         [-0.2335,  0.1066],
         [-0.2453,  0.1239],
         [-0.2445,  0.1052],
         [-0.2623,  0.1193],
         [-0.2528,  0.1023]]], grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 6, 2])
