In [4]:
%pip install -q "nncf>=2.5.0"
%pip install -q "openvino>=2023.1.0"

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

# Add the local s4 repository to the import path
import sys
sys.path.append('/Users/poomchan/Developer/s4')

from models.s4.s4 import S4Block as S4
from models.s4.s4d import S4D, S4DKernel
from tqdm.auto import tqdm

from numpy import genfromtxt
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import importlib
import utils
importlib.reload(utils)

import nncf
from nncf.parameters import ModelType
import openvino as ov

CUDA extension for structured kernels (Cauchy and Vandermonde multiplication) not found. Install by going to extensions/kernels/ and running `python setup.py install`, for improved speed and memory efficiency. Note that the kernel changed for state-spaces 4.0 and must be recompiled.
Falling back on slow Cauchy and Vandermonde kernel. Install at least one of pykeops or the CUDA extension for better speed and memory efficiency.


INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, openvino


### Setting up

In [28]:
# Filesystem layout. These are machine-specific absolute paths; adjust
# PROJECT_DIR to point at your own checkout.
PROJECT_DIR = "/Users/poomchan/Developer/light-har"
DATA_DIR = PROJECT_DIR + "/data"
MODEL_DIR = PROJECT_DIR + "/code/s4/models"

# Torch device configuration: prefer CUDA, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")
if device == "cuda":
    # Only CUDA devices expose a queryable device name here.
    print(f"Current GPU device: {torch.cuda.get_device_name(device)}")

Using mps device


### Load the Data

In [3]:
# Load the raw WISDM features and labels from DATA_DIR (defined in the setup
# cell) instead of rebuilding the data path by hand.
x = genfromtxt(DATA_DIR + '/WISDM_x.csv', delimiter=',')
# NOTE(review): read_csv treats the first row as a header while genfromtxt does
# not skip one -- confirm that x and y end up with the same number of rows.
y_df = pd.read_csv(DATA_DIR + '/WISDM_y.csv')
y = y_df.values.flatten()  # Flatten if y is 2D

# Encode the class labels as consecutive integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Function to create time series dataset
def create_series(x, y, timestep, overlap):
    """Cut a 2-D signal into fixed-length, possibly overlapping windows.

    Args:
        x: Array of shape (n_samples, n_features).
        y: Per-sample labels, indexable like ``x`` along the first axis.
        timestep: Number of consecutive samples in each window.
        overlap: Fraction of a window shared with the next one, e.g. 0.5
            slides the window by half its length.

    Returns:
        ``(dataset, labels)`` where ``dataset`` has shape
        (n_windows, timestep, n_features) and ``labels[i]`` is the label of
        the last sample of the first ``slide_step`` samples of window ``i``
        (index ``slide_step * (i + 1) - 1``).

    Raises:
        ValueError: If ``timestep`` and ``overlap`` give a slide step < 1.
    """
    # Round away float noise (10 * (1 - 0.9) == 0.9999999999999998) before
    # truncating, so the intended integer step is not lost.
    slide_step = int(round(timestep * (1 - overlap), 6))
    if slide_step < 1:
        raise ValueError(
            f"timestep={timestep} with overlap={overlap} gives a slide step of "
            f"{slide_step}; it must be at least 1"
        )

    # Each window must fit inside x, and its label index
    # (start + slide_step - 1) must fit inside y. The previous formula
    # `len(x) / slide_step - 1` was only right for overlap == 0.5: it ran past
    # the end of x for larger overlaps and dropped the last window for smaller.
    span = max(timestep, slide_step)
    data_num = max(0, (len(x) - span) // slide_step + 1)

    dataset = np.empty((data_num, timestep, x.shape[1]))
    labels = []
    for i in range(data_num):
        start = slide_step * i
        dataset[i] = x[start:start + timestep]
        labels.append(y[start + slide_step - 1])

    return dataset, np.array(labels)

# Window the signal: 16-sample windows, each sharing half with the next
seq_length = 16
overlap = 0.5
X_series, y_series = create_series(x, y_encoded, seq_length, overlap)

# Hold out 20% of the windows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_series,
    y_series,
    test_size=0.2,
    random_state=42,
)

# Features become float32 tensors, labels become int64 tensors
x_train_tensor, x_test_tensor = (
    torch.tensor(split, dtype=torch.float32) for split in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(split, dtype=torch.long) for split in (y_train, y_test)
)

# Calibration subset: the first 300 training windows and their labels
x_cal_tensor = x_train_tensor[:300]
y_cal_tensor = y_train_tensor[:300]

print(x_cal_tensor.shape)

torch.Size([300, 16, 3])


In [4]:
# Mini-batch size shared by every loader below (previously this variable was
# defined but each loader hard-coded its own 32).
batch_size = 32

# Training loader. It is not used for training in this notebook; set
# shuffle=True if it is ever used to (re)train or fine-tune the model.
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

# Test loader. Evaluation does not need shuffling; a fixed order keeps
# repeated measurement runs deterministic.
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Calibration loader over the calibration subset, in a fixed order
calibration_loader = DataLoader(
    TensorDataset(x_cal_tensor, y_cal_tensor),
    batch_size=batch_size,
    shuffle=False,
)

### Load the trained model

In [5]:
class S4Model(nn.Module):
    """Sequence classifier built from a stack of residual S4D blocks.

    Pipeline: linear encoder -> ``n_layers`` x (S4D + dropout + residual +
    LayerNorm) -> mean over the sequence axis -> linear decoder.
    """

    def __init__(
        self,
        d_input,
        d_output,
        d_model=256,
        n_layers=4,
        dropout=0.2,
        lr=0.001,
        dropout_fn=nn.Dropout,
        prenorm=False,
    ):
        """
        Args:
            d_input: Number of input features per time step.
            d_output: Number of output values (classes).
            d_model: Width of the hidden representation.
            n_layers: Number of stacked S4D residual blocks.
            dropout: Dropout rate, passed to each S4D layer and to ``dropout_fn``.
            lr: Passed through to each S4D layer as its ``lr`` argument.
            dropout_fn: Factory called with ``dropout`` to build the dropout
                module applied after each S4D layer.
            prenorm: If True, normalise before each S4D layer; otherwise
                normalise after the residual connection.
        """
        super().__init__()

        self.prenorm = prenorm

        # Input projection: d_input -> d_model
        self.encoder = nn.Linear(d_input, d_model)

        # One (S4D, LayerNorm, dropout) triple per residual block
        self.s4_layers = nn.ModuleList()
        self.norms = nn.ModuleList()
        self.dropouts = nn.ModuleList()
        for _ in range(n_layers):
            block = S4D(d_model, dropout=dropout, transposed=True, lr=lr)
            self.s4_layers.append(block)
            self.norms.append(nn.LayerNorm(d_model))
            self.dropouts.append(dropout_fn(dropout))

        # Output projection: d_model -> d_output
        self.decoder = nn.Linear(d_model, d_output)

    @staticmethod
    def _norm_over_channels(norm, h):
        """Apply ``norm`` on the channel axis of a (B, d_model, L) tensor."""
        return norm(h.transpose(-1, -2)).transpose(-1, -2)

    def forward(self, x):
        """
        Input x is shape (B, L, d_input); the result is shape (B, d_output).
        """
        h = self.encoder(x)  # (B, L, d_input) -> (B, L, d_model)
        h = h.transpose(-1, -2)  # (B, L, d_model) -> (B, d_model, L)

        blocks = zip(self.s4_layers, self.norms, self.dropouts)
        for s4, norm, drop in blocks:
            # Every block maps (B, d_model, L) -> (B, d_model, L)
            branch = self._norm_over_channels(norm, h) if self.prenorm else h

            # S4 layer returns (output, state); the state is discarded
            branch, _ = s4(branch)
            branch = drop(branch)

            # Residual connection
            h = branch + h

            if not self.prenorm:
                # Postnorm
                h = self._norm_over_channels(norm, h)

        h = h.transpose(-1, -2)  # back to (B, L, d_model)

        # Average pooling over the sequence length
        pooled = h.mean(dim=1)

        # (B, d_model) -> (B, d_output)
        return self.decoder(pooled)


In [29]:
# Rebuild the architecture with the hyper-parameters the checkpoint was
# trained with, so that every key in the state dict matches.
model = S4Model(
    d_input=3, # num of feature
    d_output=6, # 6 classes
    d_model=16,
    n_layers=4,
    dropout=0.2,
    lr=0.001,
    dropout_fn=nn.Dropout,
    prenorm=False,
)
# This notebook only runs inference and quantization, so disable dropout.
model.eval()

state_path = f"{MODEL_DIR}/s4-d16.pt"
# Map the checkpoint onto the CPU: the model itself lives on the CPU, and the
# previously hard-coded 'mps' target fails on machines without Apple MPS.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
state_dict = torch.load(state_path, map_location="cpu")
model.load_state_dict(state_dict)

<All keys matched successfully>

In [7]:
# Baseline: measure the unquantized model on the test loader with device='cpu'.
# The recorded output reports accuracy, model size, total inference time and
# CPU utilization.
utils.run_measurements(model, test_loader, device='cpu')

Accuracy on the test set: 92.40 %
Size of the model: 55.87 KB
Total inference time: 3.06 seconds
CPU Utilization: 51.80 %


### NNCF Quantization

In [8]:
# The calibration dataset is a small, unlabeled, representative dataset
# (~100-500 samples) used to estimate the range, i.e. (min, max), of all
# floating-point activation tensors in the model in order to initialize the
# quantization parameters.

def transform_fn(data_item):
    """Return only the features from a ``(features, label)`` pair.

    The label is dropped because calibration needs model inputs only.
    """
    inputs, _ = data_item
    return inputs

# Wrap the calibration loader; transform_fn strips the label from each batch.
calibration_dataset = nncf.Dataset(
    data_source=calibration_loader,
    transform_func=transform_fn,
)

# Quantize the model with NNCF, using only the two required arguments.
quantized_model = nncf.quantize(
    model=model,
    calibration_dataset=calibration_dataset,
)

2024-03-25 08:21:46.704797: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.




Output()

INFO:nncf:Compiling and loading torch extension: quantized_functions_cpu...
INFO:nncf:Finished loading torch extension: quantized_functions_cpu


Output()

In [10]:
# Put the quantized model in evaluation mode. As the cell's last expression the
# returned module is displayed, which shows the NNCF wrappers and the
# SymmetricQuantizer ops inserted around the layers.
quantized_model.eval()

S4Model(
  (encoder): NNCFLinear(
    in_features=3, out_features=16, bias=True
    (pre_ops): ModuleDict(
      (0): UpdateWeight(
        (op): SymmetricQuantizer(bit=8, ch=True)
      )
    )
    (post_ops): ModuleDict()
  )
  (s4_layers): ModuleList(
    (0-3): 4 x S4D(
      (kernel): S4DKernel()
      (activation): GELU(approximate='none')
      (dropout): DropoutNd()
      (output_linear): Sequential(
        (0): NNCFConv1d(
          16, 32, kernel_size=(1,), stride=(1,)
          (pre_ops): ModuleDict(
            (0): UpdateWeight(
              (op): SymmetricQuantizer(bit=8, ch=True)
            )
          )
          (post_ops): ModuleDict()
        )
        (1): GLU(dim=-2)
      )
    )
  )
  (norms): ModuleList(
    (0-3): 4 x NNCFLayerNorm(
      (16,), eps=1e-05, elementwise_affine=True
      (pre_ops): ModuleDict(
        (0): UpdateWeight(
          (op): SymmetricQuantizer(bit=8, ch=False)
        )
      )
      (post_ops): ModuleDict()
    )
  )
  (dropouts): 

In [9]:
# Same measurement as the baseline, now on the NNCF-quantized model.
# NOTE(review): the recorded output shows a larger size and slower inference
# than the FP32 baseline -- presumably because quantization is only simulated
# by extra ops while the model is still a PyTorch module; confirm.
utils.run_measurements(quantized_model, test_loader, device='cpu')

Accuracy on the test set: 92.47 %
Size of the model: 105.96 KB
Total inference time: 14.37 seconds
CPU Utilization: 43.50 %


### Use OpenVINO Representation

In [15]:
# Convert both models to OpenVINO, using a random example input of shape
# (1, 16, 3). `.cpu()` moves each model to the CPU before conversion.
# NOTE: as the recorded output shows, this cell currently raises
# OpConversionFailure: the PyTorch frontend has no conversion rule for
# aten::fft_rfft, aten::fft_irfft, aten::real and aten::view_as_complex.
# NOTE(review): those ops presumably come from the S4D layers -- confirm, and
# replace them with convertible ops before retrying the export.
dummy_input = torch.randn(1, 16, 3)
ov_model = ov.convert_model(model.cpu(), example_input=dummy_input)
ov_quantized_model = ov.convert_model(quantized_model.cpu(), example_input=dummy_input)

[W NNPACK.cpp:64] Could not initialize NNPACK! Reason: Unsupported hardware.


OpConversionFailure: Check 'is_conversion_successful' failed at src/frontends/pytorch/src/frontend.cpp:143:
FrontEnd API failed with OpConversionFailure:
Model wasn't fully converted. Failed operations detailed log:
-- ov::align_types with a message:
This is internal operation for type alignment and should be removed at normalization step. It can't be removed if types can't be resolved.
-- prim::Constant with a message:
None constant cannot be converted to OpenVINO opset and should be removed by consuming operation.
Summary:
-- No conversion rule found for operations: aten::fft_irfft, aten::fft_rfft, aten::real, aten::view_as_complex
-- Conversion is failed for: ov::align_types, prim::Constant
