In [1]:
from pynq import Overlay
# Location of the bitstream that contains the matrix-multiply accelerator.
BITFILE_PATH = "/home/ubuntu/workspace/pynq_bitfiles/2-24/MatMul_SA6.bit"

# Load the overlay, then keep a handle to its `mmult_accel_0` IP; every
# later cell talks to the accelerator through `accel_ip`.
overlay = Overlay(BITFILE_PATH)
accel_ip = overlay.mmult_accel_0

In [2]:
def call_fpga(A_buf, B_buf, C_buf, accel_ip, N, K, M, update_A, timeout_s=30.0):
    """
    Runs a 2D matrix multiplication on the FPGA accelerator:
      (N, K) x (K, M) => (N, M)

    Parameters:
      A_buf, B_buf, C_buf : PYNQ buffers allocated with shape=(N,K), (K,M), (N,M).
      accel_ip            : accelerator IP handle; its `register_map` is written directly.
      N, K, M             : matrix dimensions written to the accelerator registers.
      update_A            : 1 to load A into BRAM (new input), 0 to reuse persistent A.
      timeout_s           : maximum number of seconds to wait for AP_DONE.
                            Pass None to wait indefinitely (the previous behaviour).

    Raises:
      TimeoutError : the accelerator did not report AP_DONE within `timeout_s`.
                     C_buf is NOT invalidated in that case and must not be read.
    """
    # Imported locally so this cell does not depend on a later cell's imports.
    import time

    print("calling fpga, update_A =", update_A)

    # Flush input buffers to ensure data consistency.
    # Only flush A_buf if we intend to update A (update_A==1).
    if update_A:
        A_buf.flush()
    B_buf.flush()

    regs = accel_ip.register_map

    # Each 64-bit physical address is split across a low (_1) and high (_2)
    # 32-bit register.
    regs.A_1 = A_buf.physical_address & 0xFFFFFFFF
    regs.A_2 = (A_buf.physical_address >> 32) & 0xFFFFFFFF
    regs.B_1 = B_buf.physical_address & 0xFFFFFFFF
    regs.B_2 = (B_buf.physical_address >> 32) & 0xFFFFFFFF
    regs.C_1 = C_buf.physical_address & 0xFFFFFFFF
    regs.C_2 = (C_buf.physical_address >> 32) & 0xFFFFFFFF
    regs.N = N
    regs.K = K
    regs.M = M
    # Pass the update_A flag to the accelerator
    regs.update_A = update_A

    # Start the accelerator
    regs.CTRL.AP_START = 1

    # Poll for completion, but give up after `timeout_s` so that a stalled
    # accelerator cannot hang the notebook kernel forever.
    deadline = None if timeout_s is None else time.monotonic() + timeout_s
    while regs.CTRL.AP_DONE == 0:
        if deadline is not None and time.monotonic() > deadline:
            raise TimeoutError(
                "FPGA accelerator did not assert AP_DONE within "
                f"{timeout_s} s (N={N}, K={K}, M={M}, update_A={update_A})"
            )

    # Invalidate output buffer so the CPU sees the updated data from DDR
    C_buf.invalidate()

# Block 1: Helper Functions

In [3]:
import numpy as np
from pynq import allocate

def pynq_buffer_from_numpy(np_array):
    """
    Returns a newly allocated PYNQ buffer that holds a copy of np_array.

    The buffer is created with np_array's shape and dtype, and the array
    contents are then copied into it element for element.
    """
    device_buf = allocate(shape=np_array.shape, dtype=np_array.dtype)
    # Whole-array assignment copies the data into the existing buffer memory.
    device_buf[...] = np_array
    return device_buf

def requantize(int32_array, scale, zero_point=0):
    """
    Maps an int32 numpy array onto int8 with the given scale and zero_point.

    Each value is scaled, shifted by zero_point, rounded, and finally
    saturated to the int8 range:
        int8_val = clip(round(int32_val * scale + zero_point), -128, 127)
    """
    int8_range = np.iinfo(np.int8)  # min == -128, max == 127
    rounded = np.round(int32_array * scale + zero_point)
    return np.clip(rounded, int8_range.min, int8_range.max).astype(np.int8)


# Block 2: Custom Module for FPGA Offload
#### This module wraps a quantized linear layer (for Q, K, or V) and in its forward pass it quantizes its input (if needed), converts the activation and weight to PYNQ buffers, calls the FPGA accelerator via call_fpga(), and then dequantizes the resulting int32 accumulation back to float.

In [5]:
import torch
import numpy as np

class FPGAQuantizedLinear(torch.nn.Module):
    """
    Replacement for a dynamically quantized linear layer whose int8 matrix
    multiplication is executed through call_fpga().

    The int8 weight and its scale are extracted once at construction time.
    Each forward pass quantizes the activation, runs the accelerator, and
    dequantizes the int32 accumulator back to float32 (adding the FP32 bias).
    """

    def __init__(self, quantized_linear, act_scale, accel_ip, hidden_size=768, update_A=True):
        """
        Parameters:
          quantized_linear : an instance of DynamicQuantizedLinear from the quantized model.
          act_scale        : scaling factor for quantizing input activations.
          accel_ip         : the FPGA accelerator IP handle.
          hidden_size      : hidden dimension size (typically 768).
          update_A         : flag indicating whether to update A in persistent BRAM (True for Q, False for K/V).
        """
        super().__init__()
        self.accel_ip = accel_ip
        self.hidden_size = hidden_size
        self.act_scale = act_scale
        self.update_A = update_A  # Forwarded to call_fpga() on every forward pass.

        # Extract quantized weight and its parameters.
        self.weight_int8_tensor = quantized_linear.weight().int_repr()
        self.weight_scale = quantized_linear.weight().q_scale()
        # Stored for reference only; the dequantization below does not apply it.
        self.weight_zero_point = quantized_linear.weight().q_zero_point()
        # Transpose so that the weight shape becomes (in_features, out_features)
        self.weight_int8 = self.weight_int8_tensor.cpu().numpy().T

        # Effective scale: multiplication of activation scale and weight scale.
        self.effective_scale = self.act_scale * self.weight_scale

        # The bias is read back as a tensor and kept as a float32 NumPy array.
        bias_val = quantized_linear.bias()
        if bias_val is not None:
            self.bias = bias_val.detach().cpu().numpy().astype(np.float32)
        else:
            self.bias = None

    def forward(self, x):
        """
        Forward pass for FPGA offload.

        Accepts input x which may be 2D (N, D) or 3D (B, S, D). A 3D input is
        flattened to 2D for the matrix multiplication and reshaped back to
        (B, S, hidden_size) afterwards.

        Floating-point inputs of any precision are quantized to int8 with
        self.act_scale; non-floating inputs are assumed to be quantized
        already and are only cast to int8. The int32 result of the FPGA
        multiplication is dequantized to FP32 and the bias is added (if
        available).

        Returns a float32 torch tensor.
        """
        if x.dim() == 3:
            B, S, D = x.shape
            x_flat = x.reshape(B * S, D)
        else:
            x_flat = x

        # Number of rows for the FPGA call.
        N = x_flat.shape[0]

        # Quantize every floating dtype (not just float32); otherwise a
        # float16/float64 activation would be truncated to int8 unscaled.
        if x_flat.is_floating_point():
            x_int8 = torch.clamp(torch.round(x_flat / self.act_scale), -128, 127).to(torch.int8)
        else:
            x_int8 = x_flat

        # Convert to a NumPy int8 array.
        x_np = x_int8.cpu().numpy().astype(np.int8)

        # Track every DMA buffer so it is released even if the accelerator
        # call raises; contiguous memory is a limited resource.
        dma_buffers = []
        try:
            # The activation buffer is created even when update_A is False;
            # call_fpga() still programs its address into the accelerator.
            A_buf = pynq_buffer_from_numpy(x_np)
            dma_buffers.append(A_buf)
            W_buf = pynq_buffer_from_numpy(self.weight_int8)
            dma_buffers.append(W_buf)
            # Output buffer for the int32 result (shape: (N, hidden_size)).
            C_buf = allocate((N, self.hidden_size), dtype=np.int32)
            dma_buffers.append(C_buf)

            call_fpga(A_buf, W_buf, C_buf, self.accel_ip, N, self.hidden_size, self.hidden_size, update_A=int(self.update_A))

            # np.array() copies the result out of the DMA buffer, so the
            # buffer can be released safely below.
            C_int32 = np.array(C_buf)
        finally:
            for buf in dma_buffers:
                buf.freebuffer()

        # Dequantize: convert int32 accumulator to FP32 using the effective scale.
        out_fp32 = C_int32.astype(np.float32) * self.effective_scale

        # The bias broadcasts across rows, so it is added to every row.
        if self.bias is not None:
            out_fp32 = out_fp32 + self.bias

        # Convert back to a torch tensor.
        out_tensor = torch.tensor(out_fp32, dtype=torch.float32)

        # If the original input was 3D, reshape back to (B, S, hidden_size).
        if x.dim() == 3:
            out_tensor = out_tensor.reshape(B, S, self.hidden_size)
        return out_tensor

# Block 3: Function to Replace Q, K, V Layers with FPGA-Offloaded Versions
#### This function walks through all transformer layers in the quantized DistilBERT model and replaces the Q, K, and V projection layers with our custom FPGA-accelerated module.

In [6]:
def integrate_fpga_offload(model_quant, act_scale, accel_ip, hidden_size=768):
    """
    Replaces the Q, K, V projection layers in each transformer layer with the FPGA-accelerated custom module.

    The model is modified in place. Calling this function again on the same
    model is safe: projections that are already FPGAQuantizedLinear instances
    are left untouched (and therefore keep the act_scale they were built with).

    Parameters:
      model_quant  : Quantized DistilBertForSequenceClassification model.
      act_scale    : Scaling factor for quantizing activations (assumed same for demo).
      accel_ip     : Configured FPGA accelerator IP handle.
      hidden_size  : Hidden dimension (typically 768).
    """
    # (attribute name, update_A): Q loads A into the accelerator's persistent
    # storage; K and V reuse it.
    projections = (("q_lin", True), ("k_lin", False), ("v_lin", False))

    for layer in model_quant.distilbert.transformer.layer:
        attention = layer.attention
        for attr_name, update_A in projections:
            current = getattr(attention, attr_name)
            # Already wrapped by an earlier call: wrapping again would fail,
            # because the wrapper has no quantized weight() to extract.
            if isinstance(current, FPGAQuantizedLinear):
                continue
            setattr(
                attention,
                attr_name,
                FPGAQuantizedLinear(current, act_scale, accel_ip, hidden_size, update_A=update_A),
            )

In [7]:
import numpy as np

def compute_activation_scale(activation_list, percentile=99.9, use_demo=0):
    """
    Derives one global int8 activation scale from calibration activations.

    Parameters:
      activation_list: List of NumPy arrays with calibration activations
                       (for example, embedding-layer outputs).
      percentile:      Percentile of |activation| used as the reference value
                       when use_demo is falsy.
      use_demo:        Truthy selects the demo method (reference = max |activation|);
                       falsy selects the robust, percentile-based method.

    Returns:
      reference / 127.0, or 1.0 when the reference value is exactly zero.
    """
    # Pool every calibration sample into one flat vector of magnitudes.
    magnitudes = np.abs(np.concatenate([act.flatten() for act in activation_list]))

    if use_demo:
        reference = np.max(magnitudes)
    else:
        reference = np.percentile(magnitudes, percentile)

    # An all-zero calibration set would give a zero scale; use 1.0 instead.
    if reference == 0:
        return 1.0
    return reference / 127.0


# Block 4: Example Usage – Custom Forward Pass Integration
#### This block shows how to load and quantize the model, extract an activation from the embedding layer, integrate the FPGA offload into the model’s transformer layers, and run a forward pass. (For demonstration, only the Q, K, V projections are offloaded; the rest of the model remains unchanged.)

In [8]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

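# call_fpga() and accel_ip must already be defined: the first cells of this
# notebook load the overlay on the KV260 and create accel_ip. If you use a
# different setup, create the handle yourself first, for example:
# accel_ip = get_accel_ip_handle()   # <-- user-specific setup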

# 1. Load the fine-tuned model and its tokenizer, in inference mode
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name).eval()

# Dynamic quantization converts the Linear layers to int8.
model_int8 = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
).eval()

# 2. Collect embedding-layer activations for the calibration sentences
calib_sentences = [
    "The moonlight shimmered over the ocean as waves gently kissed the sandy shore, while distant lanterns flickered in the cool evening breeze. A lone traveler wandered along the coastline, footsteps sinking softly into the damp sand, lost in thought. The rhythmic sound of the water mixed with the rustling palms, creating a nice"
]
calib_activations = []
with torch.no_grad():
    for sentence in calib_sentences:
        inputs = tokenizer(sentence, return_tensors="pt")
        # Embedding output has shape (1, L, 768); drop the batch dimension
        # so that each stored sample is (L, 768).
        emb = model.distilbert.embeddings(inputs.input_ids).squeeze(0)
        calib_activations.append(emb.cpu().numpy())

# Alternative: percentile-based (robust) scale, currently disabled.
# global_act_scale_robust = compute_activation_scale(calib_activations, percentile=99.9, use_demo=0)
# print("Global Activation Scale (Robust):", global_act_scale_robust)

# Max-based (demo) scale; this is the one used by the following cells.
global_act_scale_demo = compute_activation_scale(calib_activations, use_demo=1)
print("Global Activation Scale (Demo):", global_act_scale_demo)

# The calibration sentence doubles as the test input.
test_sentence = calib_sentences[0]
print(f"input = '{test_sentence}'")

Global Activation Scale (Demo): 0.06596306177574819
input = 'The moonlight shimmered over the ocean as waves gently kissed the sandy shore, while distant lanterns flickered in the cool evening breeze. A lone traveler wandered along the coastline, footsteps sinking softly into the damp sand, lost in thought. The rhythmic sound of the water mixed with the rustling palms, creating a nice'


In [9]:
import time

# CPU-only Inference
# NOTE: integrate_fpga_offload() below modifies model_int8 in place, so this
# baseline is only a true CPU measurement the first time the cell is run
# after model_int8 has been (re)created.
inputs = tokenizer(test_sentence, return_tensors="pt")

# perf_counter() is a monotonic, high-resolution clock intended for timing;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()
with torch.no_grad():
    outputs_cpu = model_int8(inputs.input_ids)
    logits_cpu = outputs_cpu.logits
cpu_time = time.perf_counter() - start_time

print(f"CPU Inference Time: {cpu_time:.6f} seconds")
print("CPU Logits:", logits_cpu)

# FPGA-Offloaded Inference: swap the Q/K/V projections for the FPGA versions.
integrate_fpga_offload(model_int8, global_act_scale_demo, accel_ip, hidden_size=768)

start_time = time.perf_counter()
with torch.no_grad():
    outputs_fpga = model_int8(inputs.input_ids)
    logits_fpga = outputs_fpga.logits
fpga_time = time.perf_counter() - start_time

print(f"FPGA Offloaded Inference Time: {fpga_time:.6f} seconds")
print("FPGA Logits:", logits_fpga)

# Compare Speedup (values below 1.0 mean the offloaded run was slower)
speedup = cpu_time / fpga_time
print(f"Speedup (CPU vs. FPGA): {speedup:.2f}x")

# Compare Logits Difference
logits_diff = torch.abs(logits_cpu - logits_fpga)
max_diff = torch.max(logits_diff).item()
mean_diff = torch.mean(logits_diff).item()

print(f"Max Logits Difference: {max_diff:.6f}")
print(f"Mean Logits Difference: {mean_diff:.6f}")


CPU Inference Time: 0.553830 seconds
CPU Logits: tensor([[-3.1795,  3.4633]])
calling fpga, update_A = 1
calling fpga, update_A = 0
calling fpga, update_A = 0
calling fpga, update_A = 1
calling fpga, update_A = 0
calling fpga, update_A = 0
calling fpga, update_A = 1
calling fpga, update_A = 0
calling fpga, update_A = 0
calling fpga, update_A = 1
calling fpga, update_A = 0
calling fpga, update_A = 0
calling fpga, update_A = 1
calling fpga, update_A = 0
calling fpga, update_A = 0
calling fpga, update_A = 1
calling fpga, update_A = 0
calling fpga, update_A = 0
FPGA Offloaded Inference Time: 1.228901 seconds
FPGA Logits: tensor([[-2.9969,  3.2308]])
Speedup (CPU vs. FPGA): 0.45x
Max Logits Difference: 0.232455
Mean Logits Difference: 0.207528
