# PyTorch Quantization of S4
This notebook applies PyTorch quantization to an S4 model. The PyTorch quantization API can only quantize the `Linear()` layer; the S4D kernel is built from `nn.Parameter()` tensors and is not supported by the API. However, the current version runs into a kernel-crash issue and does not successfully run the quantized S4 model. Please see NNCF_quantization for an alternative to the PyTorch API.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

# Import the s4 model path
import sys
sys.path.append('/Users/poomchan/Developer/s4')

import s4d
from tqdm.auto import tqdm

from numpy import genfromtxt
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import importlib
import utils
importlib.reload(utils)
importlib.reload(s4d)

import nncf
from nncf.parameters import ModelType
import openvino as ov

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, openvino


### Set up

In [2]:
# Torch device configuration: prefer CUDA, then Apple MPS, otherwise fall back to CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")
if device == "cuda":
    # Only a CUDA device has a queryable GPU name
    print(f"Current GPU device: {torch.cuda.get_device_name(device)}")
print(torch.__version__)

Using mps device
2.1.0


In [3]:
# Root of the project checkout; the data and model folders are derived from it
PROJECT_DIR = "/Users/poomchan/Developer/light-har"
DATA_DIR = f"{PROJECT_DIR}/data"
MODEL_DIR = f"{PROJECT_DIR}/code/s4/models"

### Load the Data

In [4]:
# Load Data
# Read both CSVs through DATA_DIR so the data location is configured in one place
x = genfromtxt(f"{DATA_DIR}/WISDM_x.csv", delimiter=',')
y_df = pd.read_csv(f"{DATA_DIR}/WISDM_y.csv")
y = y_df.to_numpy().flatten()  # Flatten if y is 2D

# Encode labels as consecutive integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Function to create time series dataset
def create_series(x, y, timestep, overlap):
    """Cut a 2-D sample array into fixed-length, overlapping windows.

    Args:
        x: Array of shape (num_samples, num_features).
        y: Per-sample labels, indexable by integer position like ``x``.
        timestep: Number of consecutive samples in each window.
        overlap: Fraction of a window shared with the next one. The stride
            between window starts is ``int(timestep * (1 - overlap))``.

    Returns:
        A ``(dataset, labels)`` pair. ``dataset`` is a float array of shape
        (num_windows, timestep, num_features). ``labels`` holds one entry per
        window, taken from ``y`` at the last sample of the window's first
        stride (``start + stride - 1``).

    Raises:
        ValueError: If the resulting stride is smaller than one sample.
    """
    slide_step = int(timestep * (1 - overlap))
    if slide_step < 1:
        raise ValueError(
            f"overlap={overlap} with timestep={timestep} gives a stride of "
            f"{slide_step}; the stride must be at least 1"
        )

    # Count only windows that fit entirely inside x. The previous formula
    # (len(x) / slide_step - 1) was only right for 50% overlap: it overran x
    # for larger overlaps and dropped the last window for smaller ones.
    data_num = max(0, (len(x) - timestep) // slide_step + 1)
    dataset = np.empty((data_num, timestep, x.shape[1]))
    labels = []

    for i in range(data_num):
        start = slide_step * i
        dataset[i] = x[start:start + timestep]
        labels.append(y[start + slide_step - 1])

    return dataset, np.array(labels)

# Window the samples into fixed-length sequences with 50% overlap
seq_length = 16
overlap = 0.5
X_series, y_series = create_series(x, y_encoded, timestep=seq_length, overlap=overlap)

# Hold out 20% of the windows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_series,
    y_series,
    test_size=0.2,
    random_state=42,
)

# Wrap the splits as PyTorch tensors: float32 features, int64 class indices
x_train_tensor, x_test_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.long) for targets in (y_train, y_test)
)

In [5]:
# Create a DataLoader
# batch_size is the single source of truth for all three loaders below
batch_size = 32
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
# Training batches are shuffled so each epoch sees the samples in a new order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Create a test Dataloader
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
# Evaluation does not benefit from shuffling; a fixed order keeps runs repeatable
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Create a calibration Dataloader
# A small, fixed slice of the training data is used to calibrate the quantization observers
num_calibration_samples = 300
x_cal_tensor = x_train_tensor[:num_calibration_samples]
y_cal_tensor = y_train_tensor[:num_calibration_samples]
calibration_loader = DataLoader(
    TensorDataset(x_cal_tensor, y_cal_tensor),
    batch_size=batch_size,
    shuffle=False,
)

### Load the trained model

In [6]:
class S4Model(nn.Module):
    """Sequence classifier built from a stack of residual S4D blocks.

    Pipeline: linear encoder -> ``n_layers`` x (S4D block + dropout + residual
    add + LayerNorm) -> mean pooling over the sequence -> linear decoder.

    ``QuantStub`` / ``DeQuantStub`` modules mark where tensors enter and leave
    the quantized domain for PyTorch eager-mode static quantization. They pass
    tensors through unchanged until the model is prepared and converted.

    Args:
        d_input: Number of input features per time step.
        d_output: Number of output values (classes).
        d_model: Width of the hidden representation.
        n_layers: Number of S4D blocks.
        dropout: Dropout rate, passed to each S4D block and to ``dropout_fn``.
        lr: Forwarded unchanged to each ``s4d.S4D`` block.
        dropout_fn: Factory for the dropout module applied after each block.
        prenorm: If True, LayerNorm is applied before each S4D block;
            otherwise it is applied after the residual connection.
    """

    def __init__(
        self,
        d_input,
        d_output,
        d_model=256,
        n_layers=4,
        dropout=0.2,
        lr=0.001,
        dropout_fn=nn.Dropout,
        prenorm=False,
    ):
        super().__init__()

        self.prenorm = prenorm

        # Linear encoder
        self.encoder = nn.Linear(d_input, d_model)

        # Stack S4 layers as residual blocks
        self.s4_layers = nn.ModuleList()
        self.norms = nn.ModuleList()
        self.dropouts = nn.ModuleList()
        for _ in range(n_layers):
            self.s4_layers.append(
                s4d.S4D(d_model, dropout=dropout, transposed=True, lr=lr)
            )
            self.norms.append(nn.LayerNorm(d_model))
            self.dropouts.append(dropout_fn(dropout))

        # Linear decoder
        self.decoder = nn.Linear(d_model, d_output)

        # Boundaries between float and quantized tensors
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        """
        Input x is shape (B, L, d_input)

        Returns a float tensor of shape (B, d_output).
        """
        # Perform static quantization
        x = self.quant(x)

        x = self.encoder(x)  # (B, L, d_input) -> (B, L, d_model)

        x = x.transpose(-1, -2)  # (B, L, d_model) -> (B, d_model, L)
        for layer, norm, dropout in zip(self.s4_layers, self.norms, self.dropouts):
            # Each iteration of this loop will map (B, d_model, L) -> (B, d_model, L)
            z = x
            if self.prenorm:
                # Prenorm
                z = norm(z.transpose(-1, -2)).transpose(-1, -2)

            # Apply S4 block: we ignore the state input and output
            z, _ = layer(z)

            # Dropout on the output of the S4 block
            z = dropout(z)

            # Residual connection: the add is done on dequantized (float) tensors
            x, z = self.dequant(x), self.dequant(z)
            x = z + x
            # Only the sum is carried forward, so only it is re-quantized.
            # z is overwritten at the top of the next iteration; quantizing it
            # would waste a kernel call and feed the shared observer
            # activations that are never consumed in quantized form.
            x = self.quant(x)

            if not self.prenorm:
                # Postnorm
                x = norm(x.transpose(-1, -2)).transpose(-1, -2)

        x = x.transpose(-1, -2)

        # Pooling: average pooling over the sequence length
        x = x.mean(dim=1)

        # Decode the outputs
        x = self.decoder(x)  # (B, d_model) -> (B, d_output)

        # Dequantize
        x = self.dequant(x)

        return x


In [7]:
# Rebuild the architecture with the hyperparameters used for the saved checkpoint
model = S4Model(
    d_input=3, # num of feature
    d_output=6, # 6 classes
    d_model=16,
    n_layers=4,
    dropout=0.2,
    lr=0.001,
    dropout_fn=nn.Dropout,
    prenorm=False,
)

state_path = f"{MODEL_DIR}/s4-d16.pt"
# Map the checkpoint to CPU: the model is measured and quantized on CPU, and
# a CPU map location also works on machines without an MPS or CUDA device.
state_dict = torch.load(state_path, map_location='cpu')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [8]:
# Switch to evaluation mode (disables dropout); as the cell's last expression
# this also displays the module tree.
model.eval()

S4Model(
  (encoder): Linear(in_features=3, out_features=16, bias=True)
  (s4_layers): ModuleList(
    (0-3): 4 x S4D(
      (kernel): S4DKernel()
      (activation): GELU(approximate='none')
      (dropout): DropoutNd()
      (output_linear): Sequential(
        (0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
        (1): GLU(dim=-2)
      )
      (quant): QuantStub()
      (dequant): DeQuantStub()
    )
  )
  (norms): ModuleList(
    (0-3): 4 x LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  )
  (dropouts): ModuleList(
    (0-3): 4 x Dropout(p=0.2, inplace=False)
  )
  (decoder): Linear(in_features=16, out_features=6, bias=True)
  (quant): QuantStub()
  (dequant): DeQuantStub()
)

In [9]:
# Baseline: measure the float (not yet quantized) model on CPU. Per the cell
# output this reports accuracy, model size, total inference time and CPU utilization.
utils.run_measurements(model, test_loader, device='cpu')

Accuracy on the test set: 92.40 %
Size of the model: 56.25 KB
Total inference time: 2.82 seconds
CPU Utilization: 52.65 %


### Custom Quantization

In [10]:
import copy

# Pick the quantized backend from what this PyTorch build actually supports:
# "fbgemm" targets x86 CPUs, "qnnpack" targets ARM (e.g. Apple Silicon).
supported_engines = torch.backends.quantized.supported_engines
backend = "fbgemm" if "fbgemm" in supported_engines else "qnnpack"
if backend in supported_engines:
    # The runtime engine has to match the backend the qconfig was built for
    torch.backends.quantized.engine = backend

# Quantize a copy so the float model stays available for comparison
quantized_model = copy.deepcopy(model)
quantized_model.eval()

# Prepare: attach the backend's default observers to the model
quantized_model.qconfig = torch.quantization.get_default_qconfig(backend)
torch.quantization.prepare(quantized_model, inplace=True)

# Calibrate: run representative (validation) data through the model so the
# observers can record activation ranges
with torch.inference_mode():
    for inputs, labels in calibration_loader:
        quantized_model(inputs)

# Convert: swap observed modules for their quantized counterparts.
# Left as the last expression so the cell displays the converted model.
torch.quantization.convert(quantized_model, inplace=True)



S4Model(
  (encoder): QuantizedLinear(in_features=3, out_features=16, scale=0.3021073341369629, zero_point=64, qscheme=torch.per_channel_affine)
  (s4_layers): ModuleList(
    (0): S4D(
      (kernel): S4DKernel()
      (activation): GELU(approximate='none')
      (dropout): DropoutNd()
      (output_linear): Sequential(
        (0): QuantizedConv1d(16, 32, kernel_size=(1,), stride=(1,), scale=0.9975776076316833, zero_point=74)
        (1): GLU(dim=-2)
      )
      (quant): Quantize(scale=tensor([0.2703]), zero_point=tensor([51]), dtype=torch.quint8)
      (dequant): DeQuantize()
    )
    (1): S4D(
      (kernel): S4DKernel()
      (activation): GELU(approximate='none')
      (dropout): DropoutNd()
      (output_linear): Sequential(
        (0): QuantizedConv1d(16, 32, kernel_size=(1,), stride=(1,), scale=0.297935426235199, zero_point=85)
        (1): GLU(dim=-2)
      )
      (quant): Quantize(scale=tensor([0.0739]), zero_point=tensor([48]), dtype=torch.quint8)
      (dequant): DeQu

In [11]:
# Repeat the CPU measurements on the quantized model, for comparison with the
# float baseline measured earlier in the notebook.
utils.run_measurements(quantized_model, test_loader, device='cpu')

Accuracy on the test set: 91.49 %
Size of the model: 64.27 KB
Total inference time: 13.46 seconds
CPU Utilization: 73.65 %
