In [1]:
import pynq.pl_server.global_state
pynq.pl_server.global_state.clear_global_state()

In [2]:
import numpy as np
import torch
from pynq import Overlay, allocate
import time
import os

print("PYNQ and Torch environment loaded successfully.")

# ====== 1. 全局常量定义 ======
# 数据维度
N_DIM = 64
D_DIM = 768
BUFFER_SHAPE = (N_DIM * D_DIM,)

# 文件路径
BITSTREAM_PATH = "activation_accelerator.bit"

# 从 Vivado 时序报告中确认的核心时钟频率 (MHz)
CLOCK_FREQ_MHZ = 250.0

# 单个 bf16 元素占用的字节数
BYTES_PER_ELEMENT = 2

# 每次计算涉及的总数据量 (从 BRAM/URAM 读一次, 写一次)
TOTAL_BYTES_MOVED = N_DIM * D_DIM * BYTES_PER_ELEMENT * 2 

# Elementwise 操作的总操作数 (用于计算 GOPS)
TOTAL_ELEMENTWISE_OPS = N_DIM * D_DIM


PYNQ and Torch environment loaded successfully.


In [3]:
# ====== 2. 加载 Overlay 和数据 ======
print(f"Loading bitstream: {BITSTREAM_PATH}")
overlay = Overlay(BITSTREAM_PATH)
help(overlay)


Loading bitstream: activation_accelerator.bit


Help on Overlay in module pynq.overlay:

<pynq.overlay.Overlay object>
    Default documentation for overlay activation_accelerator.bit. The following
    attributes are available on this overlay:
    
    IP Blocks
    ----------
    activation_accelerat_0 : pynq.overlay.DefaultIP
    zynq_ultra_ps_e_0    : pynq.overlay.DefaultIP
    
    Hierarchies
    -----------
    None
    
    Interrupts
    ----------
    None
    
    GPIO Outputs
    ------------
    None
    
    Memories
    ------------
    PSDDR                : Memory



In [4]:
# 获取 IP 句柄。PYNQ 将其识别为 DefaultIP 类型，这是正确的。
acc_ip = overlay.activation_accelerat_0
help(acc_ip)
print(f"Bitstream loaded. IP core handle '{acc_ip}' is ready.")

Help on DefaultIP in module pynq.overlay object:

class DefaultIP(builtins.object)
 |  DefaultIP(description)
 |  
 |  Driver for an IP without a more specific driver
 |  
 |  This driver wraps an MMIO device and provides a base class
 |  for more specific drivers written later. It also provides
 |  access to GPIO outputs and interrupts inputs via attributes. More specific
 |  drivers should inherit from `DefaultIP` and include a
 |  `bindto` entry containing all of the IP that the driver
 |  should bind to. Subclasses meeting these requirements will
 |  automatically be registered.
 |  
 |  Attributes
 |  ----------
 |  mmio : pynq.MMIO
 |      Underlying MMIO driver for the device
 |  _interrupts : dict
 |      Subset of the PL.interrupt_pins related to this IP
 |  _gpio : dict
 |      Subset of the PL.gpio_dict related to this IP
 |  
 |  Methods defined here:
 |  
 |  __init__(self, description)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  read(se

In [5]:
# =================================================================================
#  >>> 最终确认版: HLS IP 核的 AXI-Lite 寄存器地址偏移 <<<
#  (根据您 HLS 在合并接口后新生成的 xactivation_accelerator_hw.h 文件)
# =================================================================================
ADDR_AP_CTRL   = 0x00
ADDR_IN0_LOW   = 0x10
ADDR_IN0_HIGH  = 0x14
ADDR_IN1_LOW   = 0x1c
ADDR_IN1_HIGH  = 0x20
ADDR_OUT_LOW   = 0x28
ADDR_OUT_HIGH  = 0x2c
ADDR_STAGE     = 0x34 # 注意: 地址已根据新 hw.h 文件更新
ADDR_CONFIG    = 0x3c # 注意: 地址已根据新 hw.h 文件更新
# =================================================================================



In [6]:
# 竞赛函数与您硬件的 Opcode 映射
OPCODE_MAP = {
    'Softmax': 9, 'LayerNorm': 6, 'RMSNorm': 5, 'SiLU': 4,
    'GELU': 8, 'Add': 0, 'Mul': 7,
}

# 按照竞赛权重顺序排列的待测函数列表
FUNCTIONS_TO_TEST = ['Softmax', 'LayerNorm', 'RMSNorm', 'SiLU', 'GELU', 'Add', 'Mul']

# 精度评分参数
LOSSLESS_THRESHOLD = 1e-3
print("Cell 1 finished")

Cell 1 finished


In [7]:
def to_float32_from_bits(bf16_bits_np: np.ndarray):
    """将 uint16 的 bf16 位模式数组转换为 float32 numpy 数组"""
    assert bf16_bits_np.dtype == np.uint16
    u32s = bf16_bits_np.astype(np.uint32) << 16
    return u32s.view(np.float32)

def calculate_accuracy_score(fpga_out_bits_np: np.ndarray, golden_ref_pt: torch.Tensor):
    """根据竞赛规则计算 L2 误差和精度分数 (修正版)"""
    fpga_out_f32 = to_float32_from_bits(fpga_out_bits_np)
    golden_ref_f32 = golden_ref_pt.to(torch.float32).cpu().numpy().flatten()

    # --- 1. 严格检查特殊值 (NaN 和 Inf) ---
    nan_mask_fpga = np.isnan(fpga_out_f32)
    nan_mask_golden = np.isnan(golden_ref_f32)
    if np.any(nan_mask_fpga != nan_mask_golden):
        mismatched_special_count = np.count_nonzero(nan_mask_fpga != nan_mask_golden)
        print(f"  [Accuracy FATAL] Mismatch in NaN locations! Count: {mismatched_special_count}")
        return float('inf'), 0.0, mismatched_special_count

    inf_mask_fpga = np.isinf(fpga_out_f32)
    inf_mask_golden = np.isinf(golden_ref_f32)
    if np.any(inf_mask_fpga != inf_mask_golden):
        mismatched_special_count = np.count_nonzero(inf_mask_fpga != inf_mask_golden)
        print(f"  [Accuracy FATAL] Mismatch in Inf locations! Count: {mismatched_special_count}")
        return float('inf'), 0.0, mismatched_special_count
        
    # 检查 Inf 的符号是否匹配
    if np.any(inf_mask_golden): # 只有当存在 Inf 时才检查
      if not np.all(fpga_out_f32[inf_mask_golden] == golden_ref_f32[inf_mask_golden]):
          mismatched_special_count = np.count_nonzero(fpga_out_f32[inf_mask_golden] != golden_ref_f32[inf_mask_golden])
          print(f"  [Accuracy FATAL] Mismatch in Inf signs! Count: {mismatched_special_count}")
          return float('inf'), 0.0, mismatched_special_count

    # --- 2. 计算常规数值的 L2 误差 ---
    # 正确的 valid_indices：排除所有 NaN 和 Inf
    valid_indices = ~nan_mask_golden & ~inf_mask_golden
    
    # 如果所有值都是特殊值，那么到这里已经通过了
    if not np.any(valid_indices):
        return 0.0, 1.0, 0

    numerator = np.linalg.norm(fpga_out_f32[valid_indices] - golden_ref_f32[valid_indices])
    denominator = np.linalg.norm(golden_ref_f32[valid_indices])
    
    if denominator < 1e-12:
        relative_l2_error = 0.0 if numerator < 1e-12 else float('inf')
    else:
        relative_l2_error = numerator / denominator
    
    # --- 3. 计算精度分数 ---
    ef = relative_l2_error
    LOSSLESS_THRESHOLD = 1e-3 # 与脚本其他部分保持一致
    if ef <= LOSSLESS_THRESHOLD:
        accuracy_score = 1.0
    elif ef < 100 * LOSSLESS_THRESHOLD:
        accuracy_score = (np.log(100 * LOSSLESS_THRESHOLD) - np.log(ef)) / np.log(100)
    else:
        accuracy_score = 0.0
        
    # --- 4. 计算常规数值中，位模式不匹配的个数 ---
    fpga_bits_valid = fpga_out_bits_np[valid_indices]
    # golden_bits_valid = golden_ref_pt.view(torch.uint16).cpu().numpy().flatten()[valid_indices]
    golden_bits_valid = golden_ref_pt.view(torch.int16).cpu().numpy().flatten()[valid_indices]
    diff_count = np.count_nonzero(fpga_bits_valid != golden_bits_valid)

    return ef, accuracy_score, diff_count
print("Cell 2 finished")

Cell 2 finished


In [8]:
X_BIN_PATH = "X_test_tensor_bf16.bin"
Y_BIN_PATH = "Y_test_tensor_bf16.bin"
REF_DIR = "./refs/"

In [9]:
# 加载输入向量
print(f"\nLoading test vectors: {X_BIN_PATH}, {Y_BIN_PATH}")
X_test_bits = np.fromfile(X_BIN_PATH, dtype=np.uint16)
Y_test_bits = np.fromfile(Y_BIN_PATH, dtype=np.uint16)

# 加载所有黄金参考
print("Loading golden reference files...")
golden_refs = {}
for func_name in FUNCTIONS_TO_TEST:
    ref_filename = f"ref_{func_name.lower()}_bf16.pt"
    ref_path = os.path.join(REF_DIR, ref_filename)
    golden_refs[func_name] = torch.load(ref_path, weights_only=True)
    print(f"  - Loaded reference for {func_name}")

# 在板上 DDR 分配物理连续内存
print("\nAllocating PYNQ buffers for DMA...")
in0_buffer = allocate(shape=BUFFER_SHAPE, dtype=np.uint16)
in1_buffer = allocate(shape=BUFFER_SHAPE, dtype=np.uint16)
out_buffer = allocate(shape=BUFFER_SHAPE, dtype=np.uint16)
print("Buffers allocated.")

# 拷贝数据到 DDR 并刷新
in0_buffer[:] = X_test_bits
in1_buffer[:] = Y_test_bits
in0_buffer.flush()
in1_buffer.flush()
print("Input data copied to DDR.")
print("Cell 3 finished")


Loading test vectors: X_test_tensor_bf16.bin, Y_test_tensor_bf16.bin
Loading golden reference files...
  - Loaded reference for Softmax
  - Loaded reference for LayerNorm
  - Loaded reference for RMSNorm
  - Loaded reference for SiLU
  - Loaded reference for GELU
  - Loaded reference for Add
  - Loaded reference for Mul

Allocating PYNQ buffers for DMA...
Buffers allocated.
Input data copied to DDR.
Cell 3 finished


In [10]:
def run_stage_and_wait(stage, config=0, timeout_s=15):
    """一个封装好的函数，用于执行一个阶段并等待其完成"""
    acc_ip.write(ADDR_CONFIG, config)
    acc_ip.write(ADDR_STAGE, stage)
    acc_ip.write(ADDR_AP_CTRL, 1) # ap_start
    
    t_start = time.time()
    while (acc_ip.read(ADDR_AP_CTRL) & 0x2) == 0: # 轮询 ap_done (bit 1)
        if time.time() - t_start > timeout_s:
            raise TimeoutError(f"IP execution timed out on stage={stage}, config={config}!")
        time.sleep(0.0001)

# ====== 3. 执行测试 ======
# 获取缓冲区的物理地址
pa_in0 = in0_buffer.physical_address
pa_in1 = in1_buffer.physical_address
pa_out = out_buffer.physical_address

# 将64位物理地址写入 IP 核的相应寄存器
acc_ip.write(ADDR_IN0_LOW, pa_in0 & 0xFFFFFFFF)
acc_ip.write(ADDR_IN0_HIGH, pa_in0 >> 32)
acc_ip.write(ADDR_IN1_LOW, pa_in1 & 0xFFFFFFFF)
acc_ip.write(ADDR_IN1_HIGH, pa_in1 >> 32)
acc_ip.write(ADDR_OUT_LOW, pa_out & 0xFFFFFFFF)
acc_ip.write(ADDR_OUT_HIGH, pa_out >> 32)
print("DDR buffer addresses written to IP core.\n")

print("--- Executing STAGE 0: Load data from DDR to on-chip memory ---")
run_stage_and_wait(stage=0)
print("STAGE 0 complete.\n")

results_summary = {}

# 循环测试所有函数
for func_name in FUNCTIONS_TO_TEST:
    print(f"--- Testing: {func_name} ---")
    opcode = OPCODE_MAP[func_name]
    
    # 计时 Stage 1 (计算)
    t_start = time.time()
    run_stage_and_wait(stage=1, config=opcode)
    t_end = time.time()
    latency_ms = (t_end - t_start) * 1000
    
    # 执行 Stage 2 (写回)
    run_stage_and_wait(stage=2)
    
    # 从 DDR 读取结果
    out_buffer.invalidate()
    fpga_result_bits = out_buffer.copy()
    
    # 与黄金参考对比并计算分数
    golden_ref_pt = golden_refs[func_name]
    l2_error, acc_score, diff_count = calculate_accuracy_score(fpga_result_bits, golden_ref_pt)
    
    latency_s = latency_ms / 1000.0 # 将延迟从毫秒转换为秒
    # 1. 计算有效吞吐率 (Tensors/sec)
    tensors_per_second = 1.0 / latency_s if latency_s > 0 else 0
    # 2. 计算有效内存带宽 (MB/s) - 衡量片上存储的交互速度
    effective_bandwidth_MBs = (TOTAL_BYTES_MOVED / latency_s) / (1024 * 1024) if latency_s > 0 else 0
    # 3. 计算算力 (GOPS) - 仅对 Elementwise 操作有明确意义
    # SiLU 和 GELU 也是 Elementwise 的，其 GOPS 代表每秒执行函数次数
    if func_name in ['Add', 'Mul', 'SiLU', 'GELU']:
         gops = (TOTAL_ELEMENTWISE_OPS / latency_s) / 1e9 if latency_s > 0 else 0
         gops_str = f"{gops:.2f} GOPS"
    else: # 对于 Softmax 和 Norms, 包含复杂的 Reduction, GOPS 难以统一定义
         gops_str = "N/A (Reduction-based)"
    
    results_summary[func_name] = {
        "latency_ms": latency_ms,
        "l2_error": l2_error,
        "accuracy_score": acc_score,
        "diff_elements": diff_count,
        "throughput_tensors_per_sec": tensors_per_second, # <--- NEW
        "bandwidth_MBps": effective_bandwidth_MBs,        # <--- NEW
        "gops_str": gops_str                               # <--- NEW
    }
    
    print(f"  Latency (Stage 1): {latency_ms:.4f} ms")
    print(f"  Relative L2 Error (εf): {l2_error:.6e}")
    print(f"  Mismatched Elements (non-NaN): {diff_count}")
    print(f"  Accuracy Score (Af): {acc_score:.6f}")
    # 打印新增的性能指标
    print(f"  ---------------- Performance Metrics -----------------")
    print(f"  Clock Frequency (from report): {CLOCK_FREQ_MHZ:.1f} MHz")
    print(f"  Throughput: {tensors_per_second:.2f} Tensors/sec")
    print(f"  Effective On-Chip Bandwidth: {effective_bandwidth_MBs:.2f} MB/s")
    print(f"  Compute Power: {gops_str}")
    print(f"  ----------------------------------------------------")

    # ================================================================================
    # [CORRECTED DEBUG BLOCK v2]
    # Fixes the ValueError by providing the correct format specifier for scientific
    # notation in Python f-strings (changed ":.e" to ":.1e").
    # ================================================================================
    if acc_score < 0.99999 and diff_count > 0:
        print("  [DEBUG] Mismatch detected. Performing detailed row-by-row analysis...")
    
        # --- 1. 配置常量 ---
        ROW_ERROR_THRESHOLD = 1e-2
        MAX_ROWS_TO_PRINT = 60
        ELEMENTS_PER_ROW = 4
    
        # --- 2. 准备数据 ---
        fpga_out_f32 = to_float32_from_bits(fpga_result_bits)
        golden_ref_f32 = golden_ref_pt.to(torch.float32).cpu().numpy().flatten()
        golden_ref_bits = golden_ref_pt.view(torch.int16).cpu().numpy().flatten()
        input_a_bits = X_test_bits
        input_b_bits = Y_test_bits if func_name in ['Add', 'Mul'] else None
    
        # --- 3. 逐行分析，找出所有出错的行和【真正】原因 ---
        failing_rows_info = []
        for i in range(N_DIM):
            start_idx, end_idx = i * D_DIM, (i + 1) * D_DIM
            fpga_row, golden_row = fpga_out_f32[start_idx:end_idx], golden_ref_f32[start_idx:end_idx]
    
            # [FIXED] 分别检查 NaN 和 Inf 不匹配
            nan_mismatch = np.any(np.isnan(fpga_row) != np.isnan(golden_row))
            inf_mismatch = np.any(np.isinf(fpga_row) != np.isinf(golden_row))
    
            # 计算纯数值L2误差
            valid_indices = ~np.isnan(fpga_row) & ~np.isnan(golden_row) & \
                            ~np.isinf(fpga_row) & ~np.isinf(golden_row)
            row_l2_error = 0.0
            if np.any(valid_indices):
                numerator = np.linalg.norm(fpga_row[valid_indices] - golden_row[valid_indices])
                denominator = np.linalg.norm(golden_row[valid_indices])
                if denominator > 1e-12: row_l2_error = numerator / denominator
    
            # [FIXED] 根据优先级记录失败原因
            if nan_mismatch:
                failing_rows_info.append({'idx': i, 'reason': 'NaN Mismatch'})
            elif inf_mismatch:
                failing_rows_info.append({'idx': i, 'reason': 'Inf Mismatch'})
            elif row_l2_error > ROW_ERROR_THRESHOLD:
                failing_rows_info.append({'idx': i, 'reason': f'High L2 Error ({row_l2_error:.2e})'})
    
        # --- 4. 生成最终的、清晰的调试报告 ---
        if not failing_rows_info:
            print("  [Analysis Result] A global mismatch was detected, but the per-row analysis script failed to pinpoint the cause.")
        else:
            print(f"  [Analysis Result] Found {len(failing_rows_info)} problematic rows.")
            rows_printed_count = 0
            for info in failing_rows_info:
                if rows_printed_count >= MAX_ROWS_TO_PRINT:
                    print("  ... (more problematic rows exist but are not shown)")
                    break
                
                row_idx = info['idx']
                print(f"\n  --- Details for Row {row_idx} (Reason: {info['reason']}) ---")
                
                row_start_idx = row_idx * D_DIM
                def format_val(bits):
                    val = to_float32_from_bits(np.array([bits], dtype=np.uint16))[0]
                    if np.isnan(val): return "nan"
                    if np.isposinf(val): return "+inf"
                    if np.isneginf(val): return "-inf"
                    return f"{val:<9.4f}"
    
                for i in range(ELEMENTS_PER_ROW):
                    idx = row_start_idx + i
                    in_a_b, fpga_b, golden_b = input_a_bits[idx], fpga_result_bits[idx], golden_ref_bits[idx]
                    
                    print(f"    - Idx {idx:<5d} (col {i:<2d}): In_A  = 0x{in_a_b:04x} ({format_val(in_a_b)})", end="")
                    if input_b_bits is not None:
                        in_b_b = input_b_bits[idx]
                        print(f" | In_B   = 0x{in_b_b:04x} ({format_val(in_b_b)})", end="")
                    
                    marker = "FAIL" if fpga_b != golden_b else "  pass"
                    print(f"\n      {marker:^8s} -> HLS    = 0x{fpga_b:04x} ({format_val(fpga_b)})")
                    print(f"               Golden = 0x{golden_b:04x} ({format_val(golden_b)})")
    
                rows_printed_count += 1
    
    print("-" * 25 + "\n")
print("Cell 4 finished")

DDR buffer addresses written to IP core.

--- Executing STAGE 0: Load data from DDR to on-chip memory ---
STAGE 0 complete.

--- Testing: Softmax ---
  Latency (Stage 1): 0.8447 ms
  Relative L2 Error (εf): 1.132788e-03
  Mismatched Elements (non-NaN): 19133
  Accuracy Score (Af): 0.972926
  ---------------- Performance Metrics -----------------
  Clock Frequency (from report): 250.0 MHz
  Throughput: 1183.83 Tensors/sec
  Effective On-Chip Bandwidth: 221.97 MB/s
  Compute Power: N/A (Reduction-based)
  ----------------------------------------------------
  [DEBUG] Mismatch detected. Performing detailed row-by-row analysis...
  [Analysis Result] A global mismatch was detected, but the per-row analysis script failed to pinpoint the cause.
-------------------------

--- Testing: LayerNorm ---
  Latency (Stage 1): 0.8254 ms
  Relative L2 Error (εf): 9.781953e-04
  Mismatched Elements (non-NaN): 15686
  Accuracy Score (Af): 1.000000
  ---------------- Performance Metrics -----------------


In [11]:
print("\n\n" + "=" * 110)
print(" " * 35 + "FPT'25 FINAL PERFORMANCE REPORT")
print("=" * 110)
# 增强版的表头
print(f"{'Function':<12} | {'Latency(ms)':<14} | {'Acc. Score':<12} | {'Throughput(T/s)':<16} | {'Bandwidth(MB/s)':<18} | {'Compute(GOPS)':<20}")
print("-" * 110)

for func_name in FUNCTIONS_TO_TEST:
    res = results_summary[func_name]
    
    # 准备用于打印的字符串
    lat_str = f"{res['latency_ms']:.4f}"
    acc_str = f"{res['accuracy_score']:.6f}"
    throughput_str = f"{res['throughput_tensors_per_sec']:.2f}"
    bandwidth_str = f"{res['bandwidth_MBps']:.2f}"
    gops_str = res['gops_str']
    print(f"{func_name:<12} | {lat_str:<14} | {acc_str:<12} | {throughput_str:<16} | {bandwidth_str:<18} | {gops_str:<20}")

print("=" * 110)
print("\n* T/s: Tensors per second (64x768 bf16)")
print("* Bandwidth: Effective on-chip memory bandwidth (read + write)")
print("* GOPS: Giga Operations Per Second (for element-wise functions)")


# 清理 PYNQ 缓冲区
print("\nFreeing PYNQ buffers...")
in0_buffer.close()
in1_buffer.close()
out_buffer.close()
print("Test complete. Buffers freed.")
print("All cells finished")



                                   FPT'25 FINAL PERFORMANCE REPORT
Function     | Latency(ms)    | Acc. Score   | Throughput(T/s)  | Bandwidth(MB/s)    | Compute(GOPS)       
--------------------------------------------------------------------------------------------------------------
Softmax      | 0.8447         | 0.972926     | 1183.83          | 221.97             | N/A (Reduction-based)
LayerNorm    | 0.8254         | 1.000000     | 1211.53          | 227.16             | N/A (Reduction-based)
RMSNorm      | 0.8476         | 1.000000     | 1179.83          | 221.22             | N/A (Reduction-based)
SiLU         | 0.3107         | 1.000000     | 3218.96          | 603.55             | 0.16 GOPS           
GELU         | 0.3164         | 1.000000     | 3160.74          | 592.64             | 0.16 GOPS           
Add          | 0.3231         | 1.000000     | 3095.43          | 580.39             | 0.15 GOPS           
Mul          | 0.3295         | 1.000000     | 3034.95       