## Free test

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
# Matplotlib font setup so CJK text in titles/labels can be rendered; 'SimHei' is the preferred font
plt.rcParams['font.sans-serif'] = ['SimHei', 'WenQuanYi Zen Hei', 'Arial Unicode MS'] # several fallback fonts are listed
plt.rcParams['axes.unicode_minus'] = False # keeps the minus sign '-' from rendering as a box
plt.rcParams['font.size'] = 12 # font size, adjust as needed

In [None]:
def compute_relevance_scores(query_embeddings, document_embeddings, document_mask=None):
    """
    Rank documents against one query using token-level max-similarity scoring.

    For every query token the highest dot product with any token of a document
    is taken; these maxima are summed into a single score per document, and the
    documents are ordered by that score.

    Args:
        query_embeddings: query token embeddings, shape [seq_len, d_model].
        document_embeddings: embeddings of k documents padded to a common
            length, shape [k, max_doc_len, d_model]. It is expected to live on
            the same device as ``query_embeddings``.
        document_mask: optional tensor of shape [k, max_doc_len] that is
            True / non-zero for real tokens and False / zero for padding.
            Padded positions are excluded from the max so that a padding
            vector can never be the "best match" of a query token. When
            omitted, every position is treated as a real token.

    Returns:
        1-D tensor of length k holding document indices ordered from the
        highest to the lowest total score. (Indices, not the scores.)
    """
    # 1. Batched dot product between every query token and every document token.
    scores = torch.einsum("sd,kmd->ksm", query_embeddings, document_embeddings)
    print("scores shape:", scores.shape)  # expected: [k, seq_len, max_doc_len]

    if document_mask is not None:
        # Broadcast the per-document mask over the query-token axis and push
        # padded positions to -inf so they lose every max comparison.
        padding = ~document_mask.to(torch.bool)
        scores = scores.masked_fill(padding.unsqueeze(1), float("-inf"))

    # 2. Max-pool over the document-token axis: best match per query token.
    max_scores_per_query_term = scores.max(dim=2).values
    print(
        "max_scores_per_query_term shape:", max_scores_per_query_term.shape
    )  # expected: [k, seq_len]

    # 3. Sum over query tokens to get one total score per document.
    total_scores = max_scores_per_query_term.sum(dim=1)
    print("total_scores shape:", total_scores.shape)  # expected: [k]

    # 4. Order the documents by total score, best first.
    sorted_indices = total_scores.argsort(descending=True)

    return sorted_indices


# Smoke test for compute_relevance_scores
if __name__ == "__main__":
    # Fix the random seed so the result is reproducible
    torch.manual_seed(42)

    # Simulated input sizes
    num_queries = 5  # number of tokens in the query
    embedding_dim = 768  # embedding width (BERT-base size is assumed)
    num_documents = 3  # number of test documents
    max_doc_length = 10  # maximum document length in tokens

    # Random query and document embeddings
    query_embeddings = torch.randn(num_queries, embedding_dim)
    document_embeddings = torch.randn(num_documents, max_doc_length, embedding_dim)

    # compute_relevance_scores returns document indices that are already sorted
    # by descending score. They must not be argsorted again: sorting the index
    # tensor itself yields an unrelated permutation, and labelling the indices
    # as "scores" is misleading.
    sorted_indices = compute_relevance_scores(query_embeddings, document_embeddings)
    print("按相关性分数降序排列的文档索引:", sorted_indices)

In [None]:
# --- 1. Build an example "high-rank" matrix ---
# The matrix is M x N
M, N = 100, 80
# For the demo, first build a genuinely low-rank matrix, then add noise so it becomes "high-rank" yet still well approximable
# Two "thin" factor matrices whose product is low rank
rank_k = 5 # assumed true rank
U_true = np.random.rand(M, rank_k)
V_true = np.random.rand(N, rank_k)

# The true low-rank matrix
original_low_rank_matrix = np.dot(U_true, V_true.T)

# Add random noise to get a data matrix that is numerically high-rank but has an underlying low-rank structure
noise = np.random.randn(M, N) * 0.1 # 0.1 is the noise strength
A_original = original_low_rank_matrix + noise

print(f"原始矩阵 A_original 形状: {A_original.shape}")
print(f"原始矩阵 A_original 的秩 (理论上，实际计算可能因噪声而接近满秩): {np.linalg.matrix_rank(A_original)}")
print("-" * 30)

# --- 2. Low-rank factorization via singular value decomposition (SVD) ---
# SVD factors A into U * S * Vh
# U: matrix of left singular vectors (M x M)
# s: vector of singular values (length min(M, N))
# Vh: conjugate transpose of the right singular vectors (N x N)
U, s, Vh = np.linalg.svd(A_original)

print(f"U 矩阵形状: {U.shape}")
print(f"奇异值向量 s 的长度: {s.shape}")
print(f"Vh 矩阵形状: {Vh.shape}")
print("\n前10个奇异值 (SVD能够揭示矩阵的“能量”或重要性):")
print(s[:10])
print("-" * 30)

# --- 3. Low-rank approximation by truncating the SVD ---
# Pick a k much smaller than the rank of the original matrix and approximate with it
approx_rank = 5 # target rank of the approximation

if approx_rank > len(s):
    approx_rank = len(s) # never exceed the number of singular values

# Keep the first k singular values and the matching left/right singular vectors
U_k = U[:, :approx_rank]
s_k = np.diag(s[:approx_rank]) # turn the singular-value vector into a diagonal matrix
Vh_k = Vh[:approx_rank, :]

# Rebuild the low-rank approximation A_approx
# A_approx = U_k * s_k * Vh_k
A_approx = np.dot(U_k, np.dot(s_k, Vh_k))

print(f"低秩近似矩阵 A_approx 的形状: {A_approx.shape}")
print(f"低秩近似矩阵 A_approx 的秩: {np.linalg.matrix_rank(A_approx)}") # approx_rank in theory
print("-" * 30)

# --- 4. Compare the original matrix with its low-rank approximation ---
# Approximation error measured in the Frobenius norm
approximation_error = np.linalg.norm(A_original - A_approx, 'fro')
print(f"原始矩阵和低秩近似矩阵之间的误差 (Frobenius范数): {approximation_error}")

# Plot all singular values; a fast decay is what characterises a (near) low-rank matrix
plt.figure(figsize=(10, 5))
plt.plot(s, 'o-')
plt.title('Singular Values (奇异值)')
plt.xlabel('Index')
plt.ylabel('Value')
plt.grid(True)
plt.axvline(x=approx_rank - 1, color='r', linestyle='--', label=f'Approximation Rank = {approx_rank}')
plt.legend()
plt.yscale('log') # singular values typically decay exponentially, so a log axis reads better
plt.show()

# Additionally show the original and the approximated matrix side by side as images, but only for small matrices.
# With M, N = 100, 80 as set above, M * N = 8000 exceeds the threshold, so the else branch runs.
if M * N <= 2500: # only render as images when the matrix is small enough
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.imshow(A_original, cmap='gray', aspect='auto')
    plt.title('Original Matrix')
    plt.colorbar()

    plt.subplot(1, 2, 2)
    plt.imshow(A_approx, cmap='gray', aspect='auto')
    plt.title(f'Low-Rank Approximation (rank={approx_rank})')
    plt.colorbar()
    plt.show()
else:
    print("\n矩阵太大，不适合直接可视化为图像，请检查数值。")

# Summary of the storage cost: full matrix versus truncated factors
print("\n--- 关键点总结 ---")
print(f"原始矩阵存储参数量: {M * N}")
print(f"低秩近似矩阵存储参数量 (U_k, s_k, Vh_k): {M * approx_rank + approx_rank + N * approx_rank} (不包括额外的对角矩阵存储，如果只存向量的话)")
print(f"当 rank_k ({approx_rank}) 远小于 M({M}) 和 N({N}) 时，参数量大大减少，实现了压缩。")
print("例如，如果只存储 U_k 和 Vh_k，参数量为 (M+N)*approx_rank。")
print(f"本例中：原始参数 {M*N} = {M*N}，近似参数 {(M+N)*approx_rank} = {(M+N)*approx_rank}")

In [None]:
class AttentionEnhancedModule(nn.Module):
    """Single-head attention with a gated residual mix and multi-scale feature fusion."""

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Attention projections and the gate that blends attention output
        # with the input. The gate sees [input, attention] concatenated.
        self.query_linear = nn.Linear(hidden_size, hidden_size)
        self.key_linear = nn.Linear(hidden_size, hidden_size)
        self.value_linear = nn.Linear(hidden_size, hidden_size)
        self.gate = nn.Linear(2 * hidden_size, hidden_size)

        # Two narrower views of the gated features (1/2 and 1/4 width) ...
        half_width = hidden_size // 2
        quarter_width = hidden_size // 4
        self.scale_transforms = nn.ModuleList(
            [nn.Linear(hidden_size, half_width), nn.Linear(hidden_size, quarter_width)]
        )
        # ... which are concatenated with the full-width features and projected back.
        self.scale_combine = nn.Linear(hidden_size + half_width + quarter_width, hidden_size)

        # Normalisation applied after the final residual connection.
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, x, context=None):
        """
        Run the block.

        x: input features [batch_size, seq_len, hidden_size]
        context: optional features to attend over
            [batch_size, context_len, hidden_size]; defaults to ``x``
            (self-attention).

        Returns a tensor shaped like ``x``.
        """
        source = x if context is None else context

        # Unpacking into three names rejects inputs that are not 3-D.
        _batch, _tokens, _width = x.shape

        queries = self.query_linear(x)
        keys = self.key_linear(source)
        values = self.value_linear(source)

        # Scaled dot-product attention.
        scaled_scores = (queries @ keys.transpose(-1, -2)) / (self.hidden_size ** 0.5)
        attended = torch.softmax(scaled_scores, dim=-1) @ values

        # Per-feature gate in (0, 1): 1 keeps the attention output, 0 keeps the input.
        mix = self.gate(torch.cat((x, attended), dim=-1)).sigmoid()
        blended = mix * attended + (1 - mix) * x

        # Full-width features followed by each narrower projection.
        pyramid = [blended]
        pyramid.extend(project(blended) for project in self.scale_transforms)
        fused = self.scale_combine(torch.cat(pyramid, dim=-1))

        # Residual connection to the original input, then layer norm.
        return self.layer_norm(fused + x)

In [None]:
class KnowledgeRoutingModule(nn.Module):
    """Soft mixture of per-domain expert MLPs with one routing decision per sequence."""

    def __init__(self, hidden_size, num_domains=4):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_domains = num_domains

        # One two-layer MLP ("expert") per knowledge domain.
        self.domain_experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_size, hidden_size),
                nn.GELU(),
                nn.Linear(hidden_size, hidden_size)
            ) for _ in range(num_domains)
        ])

        # Router: maps the mean-pooled sequence to one logit per domain.
        self.router = nn.Linear(hidden_size, num_domains)

        # Output projection and post-residual normalisation.
        self.output_layer = nn.Linear(hidden_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, x):
        """
        Route the input through all experts and mix their outputs.

        x: input features [batch_size, seq_len, hidden_size]

        Returns:
            output: tensor shaped like ``x``.
            routing_weights: softmax weights of shape
                [batch_size, num_domains]. The shape is the same for every
                batch size and every number of domains (a blanket
                ``squeeze()`` would drop the batch axis when batch_size == 1
                and the domain axis when num_domains == 1).

        Raises:
            ValueError: if ``x`` is not a 3-D tensor.
        """
        if x.dim() != 3:
            raise ValueError(
                f"expected x of shape [batch_size, seq_len, hidden_size], got {tuple(x.shape)}"
            )

        # One routing distribution per sequence, from its mean-pooled features.
        routing_logits = self.router(x.mean(dim=1))  # [batch_size, num_domains]
        routing_weights = F.softmax(routing_logits, dim=-1)  # [batch_size, num_domains]

        # Every expert sees the full input; outputs are stacked on a new last
        # axis: [batch_size, seq_len, hidden_size, num_domains].
        stacked_outputs = torch.stack(
            [expert(x) for expert in self.domain_experts], dim=-1
        )

        # Broadcast the weights as [batch_size, 1, 1, num_domains] and take the
        # weighted sum over the domain axis.
        routed_output = (stacked_outputs * routing_weights[:, None, None, :]).sum(dim=-1)

        # Output projection, residual connection, layer norm.
        output = self.output_layer(routed_output)
        output = self.layer_norm(output + x)

        return output, routing_weights
    

tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]])

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GatingMechanism(nn.Module):
    """
    Minimal learned gate that blends two equally shaped inputs feature by feature.
    """
    def __init__(self, input_size):
        super().__init__()
        # Linear layer over the concatenated pair, squashed into (0, 1) by a sigmoid.
        self.gate = nn.Sequential(
            nn.Linear(2 * input_size, input_size),
            nn.Sigmoid(),
        )

    def forward(self, x1, x2):
        """
        x1, x2: tensors of shape [..., input_size]

        Returns g * x1 + (1 - g) * x2, where g is computed from the
        concatenation of both inputs. The gate values are printed on every call.
        """
        mix = self.gate(torch.cat((x1, x2), dim=-1))
        print(mix)

        # mix close to 1 favours x1, close to 0 favours x2.
        return mix * x1 + (1 - mix) * x2

# Example usage
input_size = 4
gate_module = GatingMechanism(input_size)

# Example inputs
x1 = torch.randn(2, input_size)  # batch_size=2
x2 = torch.randn(2, input_size)

# Apply the gate (the module also prints the gate values it computed)
output = gate_module(x1, x2)
print(f"Input shapes: x1{x1.shape}, x2{x2.shape}")
print(f"Output shape: {output.shape}")

tensor([[0.4445, 0.5470, 0.6187, 0.6925],
        [0.5750, 0.3050, 0.5550, 0.4843]], grad_fn=<SigmoidBackward0>)
Input shapes: x1torch.Size([2, 4]), x2torch.Size([2, 4])
Output shape: torch.Size([2, 4])


## LLM code practice

### Attention

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
# Matplotlib font setup so CJK text in titles/labels can be rendered; 'SimHei' is the preferred font
plt.rcParams['font.sans-serif'] = ['SimHei', 'WenQuanYi Zen Hei', 'Arial Unicode MS'] # several fallback fonts are listed
plt.rcParams['axes.unicode_minus'] = False # keeps the minus sign '-' from rendering as a box
plt.rcParams['font.size'] = 12 # font size, adjust as needed

In [None]:
# Toy input: six 3-dimensional token embeddings stored as a 2-D tensor (6 rows x 3 columns).
# NOTE(review): the message printed below mentions a batch size of 1, but the tensor has no batch axis.
inputs = torch.tensor(
    [
        [0.43, 0.15, 0.89],
        [0.55, 0.87, 0.66],
        [0.57, 0.85, 0.64],
        [0.22, 0.58, 0.33],
        [0.77, 0.25, 0.10],
        [0.05, 0.80, 0.55],
    ]
)
print("batch size 1, seq length 6, dim 3")
inputs.shape


In [None]:
d_model = inputs[1].shape[0]  # embedding width, read from the length of one input row

# Random square projection matrices for queries, keys and values; requires_grad=False excludes them from autograd
w_q = torch.nn.Parameter(torch.rand(d_model, d_model), requires_grad=False)
w_k = torch.nn.Parameter(torch.rand(d_model, d_model), requires_grad=False)
w_v = torch.nn.Parameter(torch.rand(d_model, d_model), requires_grad=False)

# Project the token at index 1 into query / key / value space
x_2 = inputs[1]
query_2 = x_2 @ w_q
key_2 = x_2 @ w_k
value_2 = x_2 @ w_v

query_2

# Attention computed directly on the raw inputs (the projections above are not used here):
# pairwise dot products, row-wise softmax without scaling, then a weighted sum of the inputs.
# attn_scores and attn_weights are reused by later cells.
attn_scores = inputs @ inputs.T
attn_weights = torch.softmax(attn_scores, dim=1)
attn_weights @ inputs

In [None]:
class SelfAttention_v2(nn.Module):
    """Scaled dot-product self-attention with bias-free linear projections."""

    def __init__(self, d_in, d_out):
        """
        d_in: width of the input embeddings
        d_out: width of the query/key/value projections and of the output
        """
        super().__init__()
        self.W_q = nn.Linear(d_in, d_out, bias=False)
        self.W_k = nn.Linear(d_in, d_out, bias=False)
        self.W_v = nn.Linear(d_in, d_out, bias=False)

    def forward(self, x):
        """
        x: tensor of shape [num_tokens, d_in] or [..., num_tokens, d_in]

        Returns context vectors of shape [..., num_tokens, d_out].
        """
        keys = self.W_k(x)
        values = self.W_v(x)
        queries = self.W_q(x)

        # Swap the last two axes rather than using `.T`, so inputs with
        # leading batch dimensions work as well as plain 2-D inputs.
        attn_scores = queries @ keys.transpose(-2, -1)

        # Scale by sqrt of the key dimension (last axis). Using shape[0] would
        # scale by the sequence length (or batch size) instead.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)

        context_vec = attn_weights @ values
        return context_vec

In [None]:
# Instantiate the self-attention layer with d_in = d_out = 3 and run it on the example inputs
sa_v1 = SelfAttention_v2(3, 3)
print(sa_v1(inputs))

In [None]:
seqlen = 6
# Lower-triangular matrix of ones: 1 on and below the diagonal, 0 above it
torch.tril(torch.ones(seqlen, seqlen))

In [None]:
# Build a lower-triangular 0/1 mask matching the number of rows of attn_weights
context_len = attn_weights.shape[0]
mask_simple = torch.tril(torch.ones(context_len, context_len))
# The element-wise product zeroes the weights above the diagonal; rows are not renormalised here
attn_weights * mask_simple

In [None]:
# Divide every row of the 0/1 mask by its row sum, so each row of the result sums to 1.
# NOTE(review): this normalises the mask itself, not the masked attention weights — confirm that is intended.
row_sum = mask_simple.sum(dim=1, keepdim=True)
mask_simple_norm = mask_simple / row_sum
mask_simple_norm

In [None]:
# Ones strictly above the diagonal (diagonal=1) mark the positions to hide
mask = torch.triu(torch.ones(context_len, context_len), diagonal=1)
# Fill the marked positions of the scores with -inf so a following softmax assigns them zero weight
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
masked

In [None]:
# Softmax over the masked scores, scaled by sqrt(d_model); this rebinds the notebook-level attn_weights
attn_weights = torch.softmax(masked / d_model**0.5, dim=-1)
attn_weights

In [None]:
# Dropout layer with p = 0.5, applied to the attention weights.
# No seed is set in this cell, so which entries are dropped varies between runs.
dropout = nn.Dropout(0.5)
dropout(attn_weights)

In [None]:
# Stack two copies of the inputs along a new leading axis to form a batch of two sequences
batch = torch.stack((inputs, inputs), dim=0)
batch.shape

Implementing a compact causal self-attention class

In [None]:
class CausalAttention(nn.Module):
    """Single-head scaled dot-product self-attention with a causal mask and dropout."""

    def __init__(self, d_model, seq_len, dropout, qkv_bias=False):
        """
        d_model: embedding width; queries, keys and values keep this width
        seq_len: sequence length the causal mask is pre-built for
        dropout: dropout probability applied to the attention weights
        qkv_bias: whether the query/key/value projections carry a bias term
        """
        super().__init__()
        self.W_q = nn.Linear(d_model, d_model, bias=qkv_bias)
        self.W_k = nn.Linear(d_model, d_model, bias=qkv_bias)
        self.W_v = nn.Linear(d_model, d_model, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        # The mask is sized from the constructor argument, not from a
        # notebook-level global, so the class works on its own. Registering it
        # as a buffer makes it follow the module across devices.
        self.register_buffer(
            "mask", torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
        )

    def forward(self, x):
        """
        x: tensor of shape [batch, num_tokens, d_model]

        Returns context vectors of shape [batch, num_tokens, d_model]; position
        i only attends to positions <= i.
        """
        b, num_tokens, d_in = x.shape
        keys = self.W_k(x)
        queries = self.W_q(x)
        values = self.W_v(x)

        attn_scores = queries @ keys.transpose(1, 2)

        if num_tokens <= self.mask.shape[0]:
            # Reuse the registered buffer, which is already on the module's device.
            causal_mask = self.mask[:num_tokens, :num_tokens].bool()
        else:
            # Sequences longer than the pre-built mask are still supported:
            # build a mask of the required size on the input's device.
            causal_mask = torch.triu(
                torch.ones(num_tokens, num_tokens, device=x.device), diagonal=1
            ).bool()

        # Future positions get -inf so softmax assigns them zero weight.
        attn_scores = attn_scores.masked_fill(causal_mask, -torch.inf)
        attn_weights = torch.softmax(attn_scores / d_in**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = attn_weights @ values
        return context_vec
    
# Sequence length of the example batch (its second dimension)
context_length = batch.shape[1]
# d_model = 3, dropout probability 0.0
ca = CausalAttention(3, context_length, 0.0)

context_vecs = ca(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

Multi-head attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention that splits one wide projection into heads."""

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        """
        d_in: width of the input embeddings
        d_out: total output width across all heads (must be divisible by num_heads)
        context_length: side length of the pre-built causal mask
        dropout: dropout probability applied to the attention weights
        num_heads: number of attention heads
        qkv_bias: whether the query/key/value projections carry a bias term
        """
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        # Each head works on an equal slice of the projected width.
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        # Mixes the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark future positions.
        future_positions = torch.ones(context_length, context_length).triu(diagonal=1)
        self.register_buffer("mask", future_positions)

    def forward(self, x):
        """
        x: tensor of shape (b, num_tokens, d_in)

        Returns a tensor of shape (b, num_tokens, d_out).
        """
        b, num_tokens, _ = x.shape
        per_head = (b, num_tokens, self.num_heads, self.head_dim)

        # Project, then (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        # -> (b, num_heads, num_tokens, head_dim).
        q = self.W_query(x).view(per_head).transpose(1, 2)
        k = self.W_key(x).view(per_head).transpose(1, 2)
        v = self.W_value(x).view(per_head).transpose(1, 2)

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens).
        scores = q @ k.transpose(-2, -1)

        # Crop the stored mask to the current length and hide future positions.
        hide = self.mask[:num_tokens, :num_tokens].bool()
        scores = scores.masked_fill(hide, -torch.inf)

        weights = torch.softmax(scores / self.head_dim ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim),
        # then merge the heads back into one axis of width d_out.
        merged = (weights @ v).transpose(1, 2).reshape(b, num_tokens, self.d_out)

        return self.out_proj(merged)


# Read batch size, sequence length and input width from the example batch
batch_size, context_length, d_in = batch.shape
d_out = 2
# Dropout 0.0 and two heads, so each head has width d_out // 2 = 1
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)

context_vecs = mha(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)