# RKNN fixed version 

* noMel
* conv1d + fix for pad
* newAVact - approximation of GELU 

===============================================================

* build a new noMel model based on the baseline
* replace bad layers with working layers
    * no need to change the width of the layers
* run voices through the model and compare to baseline

===============================================================

In [1]:
%load_ext autoreload
%autoreload 2
## our utils
from utils.common_import import *
from utils.test_all_voices import *

2.6.0+cu124


In [2]:
%%capture --no-display
import my_utils as myUtils
from play1_setBase_line_B2 import original_model,base_line_embedding

## original activations


In [3]:
# Activation layer classes that list_activations() looks for.
ACT_TYPES = (
    nn.ReLU,
    nn.ReLU6,
    nn.LeakyReLU,
    nn.ELU,
    nn.PReLU,
    nn.GELU,
    nn.SiLU,
    nn.Sigmoid,
    nn.Tanh,
    nn.Hardswish,
)


def list_activations(model):
    """Print one line per sub-module of `model` that is an instance of ACT_TYPES.

    Each line shows the dotted module path (left-aligned, padded to 60
    characters) followed by the module's repr.
    """
    matches = (
        (path, layer)
        for path, layer in model.named_modules()
        if isinstance(layer, ACT_TYPES)
    )
    for path, layer in matches:
        print(f'{path:<60} {layer}')

In [4]:
# Show which activation layers the untouched baseline model contains.
list_activations(original_model)   

backbone.stage0.3.conv_block.act                             GELU(approximate='none')
backbone.stage0.4.conv_block.act                             GELU(approximate='none')
backbone.stage0.6.tcm.0.act                                  GELU(approximate='none')
backbone.stage0.6.tcm.1.act                                  GELU(approximate='none')
backbone.stage0.6.tcm.2.act                                  GELU(approximate='none')
backbone.stage0.6.tcm.3.act                                  GELU(approximate='none')
backbone.stage1.3.conv_block.act                             GELU(approximate='none')
backbone.stage1.4.conv_block.act                             GELU(approximate='none')
backbone.stage1.6.tcm.0.act                                  GELU(approximate='none')
backbone.stage1.6.tcm.1.act                                  GELU(approximate='none')
backbone.stage1.6.tcm.2.act                                  GELU(approximate='none')
backbone.stage1.6.tcm.3.act                           

# create new model

### Conv1dAs2d

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import floor

# ---------------------------------------------------------------------
#  Global switches
# ---------------------------------------------------------------------
DEBUG = True                 # set to False to silence every _dbg() print
MAX_NPU_KERNEL = 10          # Rockchip depth-wise limit (stride = 1)


def _dbg(msg: str):
    """Print `msg` only while the module-level DEBUG switch is truthy."""
    if not DEBUG:
        return
    print(msg)


# =====================================================================
# 1.  Shape-safe Conv1d → Conv2d wrapper (NO weight edits)
# =====================================================================
class Conv1dAs2d(nn.Module):
    """
    Wrap a Conv1d as an equivalent Conv2d with a (k, 1) kernel, reusing its weights.

      * accepts [B,C,T]  or  [B,C,T,1] and gives the same rank back
      * string padding ("same" / "valid") is converted to numeric padding, so
        the Conv2d never carries a padding string
      * "same" padding whose total pad d*(k-1) is odd (e.g. an even kernel with
        dilation 1) is reproduced exactly: Conv1d puts the extra zero on the
        right, so the wrapper pads both sides with the larger amount and drops
        the first output frame
      * the Conv2d is created on the source layer's device and dtype

    Args:
        src: the Conv1d to mirror. Its weight (and bias, if any) are copied.

    NOTE(review): `src.padding_mode` is not propagated; padding is always
    zero padding, exactly as in the previous version of this wrapper.
    """
    def __init__(self, src: nn.Conv1d):
        super().__init__()

        k = src.kernel_size[0]
        d = src.dilation[0]
        s = src.stride[0]
        g = src.groups
        in_c, out_c = src.in_channels, src.out_channels
        pad_in = src.padding                     # "same"/"valid"/int/tuple

        # Leading output frames to drop in forward(); non-zero only for
        # "same" padding with an odd total pad (see class docstring).
        self._trim_front = 0

        # numeric padding (H, W)
        if isinstance(pad_in, str):
            if pad_in == "same":
                total = d * (k - 1)
                pad_num = total - total // 2     # ceil(total / 2)
                self._trim_front = total % 2
            else:                                # "valid"
                pad_num = 0
        else:
            pad_num = pad_in[0] if isinstance(pad_in, tuple) else pad_in
        pad_hw = (pad_num, 0)

        # banner ------------------------------------------------------
        place = "CPU (k>10 depth-wise)" if (k > MAX_NPU_KERNEL and g == in_c) else "NPU"
        _dbg(f"[Conv1dAs2d] {in_c}→{out_c}  k={k} d={d} s={s} g={g}  pad={pad_hw}  ⇒ {place}")

        # build Conv2d on the same device / dtype as the source layer --------
        self.conv2d = nn.Conv2d(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=(k, 1),
            stride=(s, 1),
            padding=pad_hw,
            dilation=(d, 1),
            groups=g,
            bias=src.bias is not None,
            device=src.weight.device,
            dtype=src.weight.dtype,
        )

        # copy weights verbatim: [out, in/g, k] -> [out, in/g, k, 1]
        with torch.no_grad():
            self.conv2d.weight.copy_(src.weight.unsqueeze(-1))
            if src.bias is not None:
                self.conv2d.bias.copy_(src.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the convolution; raises ValueError for unsupported input ranks."""
        add_dim = False
        if x.dim() == 3:                   # [B,C,T]  → [B,C,T,1]
            x, add_dim = x.unsqueeze(-1), True
        elif not (x.dim() == 4 and x.shape[-1] == 1):
            raise ValueError(f"Conv1dAs2d got shape {tuple(x.shape)}")

        y = self.conv2d(x)                 # Conv2d works on 4-D
        if self._trim_front:
            # emulate Conv1d's right-heavy "same" padding
            # NOTE(review): this adds a slice to the exported graph — confirm
            # RKNN handles it if an even "same" kernel is ever wrapped.
            y = y[:, :, self._trim_front:, :]
        return y.squeeze(-1) if add_dim else y


### split >10 

In [6]:
import torch
import torch.nn as nn
from math import floor

# ------------------------------------------------------------
#  Global settings
# ------------------------------------------------------------
DEBUG_CONV1D_AS_2D   = True            # flip to False to mute the wrapper's prints
MAX_NPU_KERNEL       = 10              # HW limit (stride = 1)
MAX_SUB_KERNEL_SPLIT = 9               # odd, so symmetric pad works

def _dbg(msg: str):
    """Print `msg` only while DEBUG_CONV1D_AS_2D is truthy."""
    if not DEBUG_CONV1D_AS_2D:
        return
    print(msg)

# ------------------------------------------------------------
#  Helper: split a large receptive field into odd sub-kernels ≤ 9
#  For stride==1 the effective RF of a cascade is:
#        k_eff = sum(k_i) - (n_stages - 1)
# ------------------------------------------------------------
def _split_kernel(k: int, k_max: int = MAX_SUB_KERNEL_SPLIT):
    """Return sub-kernel sizes whose stride-1 cascade has receptive field `k`.

    The receptive field of a cascade is ``sum(k_i) - (n_stages - 1)``.

    Args:
        k: receptive field (kernel size) to reproduce.
        k_max: size used for the full stages; must be >= 2 when a split is
            needed, otherwise the receptive field could never grow.

    Returns:
        ``[k]`` when ``k <= k_max``; otherwise a list of stage sizes, e.g.
        ``19 -> [9, 9, 3]`` and ``31 -> [9, 9, 9, 7]`` for ``k_max = 9``.
        A cascade of odd kernels always has an odd receptive field, so for an
        even `k` one stage has to be even: the stage before the last is
        enlarged to ``k_max + 1`` (e.g. ``12 -> [10, 3]``).

    Raises:
        ValueError: if a split is required but ``k_max < 2``.
        RuntimeError: if the computed stages do not reproduce `k`.
    """
    if k <= k_max:
        return [k]
    if k_max < 2:
        # each full stage adds k_max - 1 to the receptive field; with
        # k_max < 2 the loop below would never terminate
        raise ValueError(f"k_max must be >= 2 to split a kernel, got {k_max}")

    segments = []
    remaining = k
    while remaining > k_max:
        segments.append(k_max)             # add a full-size stage
        remaining -= (k_max - 1)           # because RF grows by k_max-1
    if remaining % 2 == 0:                 # make the last stage odd (8 → 7)
        remaining -= 1
        segments[-1] += 1                  # compensate so RF stays exact
    segments.append(remaining)
    # explicit check instead of `assert`, which is stripped under `python -O`
    if sum(segments) - (len(segments) - 1) != k:
        raise RuntimeError("RF mismatch")
    return segments

# ------------------------------------------------------------
#  Main wrapper
# ------------------------------------------------------------
class Conv1dAs2d_split(nn.Module):
    """
    Replace a stride-1, dilation-1 Conv1d with Conv2d layers using (k, 1) kernels.

    * k ≤ 10  → single Conv2d (weights copied, runs on NPU)
    * k  > 10 → cascade of Conv2d layers sized by `_split_kernel` (runs on NPU)
                (weights are *not* copied and bias is omitted; fine-tune is required)
    Padding is always numeric – ONNX will not emit a standalone Pad op.

    In the single-Conv2d case, "valid" maps to zero padding, and "same" with an
    even kernel is reproduced exactly by padding with the larger half and
    dropping the first output frame (Conv1d puts the extra zero on the right).
    All new layers are created on the source layer's device and dtype.

    NOTE(review): in the cascade case the original padding is ignored; every
    stage uses symmetric (ks - 1) // 2 padding, so the output length matches
    the input only when all stages are odd.

    Args:
        conv1d: the layer to replace.

    Raises:
        ValueError: if the layer uses stride != 1 or dilation != 1.
    """
    def __init__(self, conv1d: nn.Conv1d):
        super().__init__()

        # ---------- original 1-D parameters ----------
        k, d, s, g = conv1d.kernel_size[0], conv1d.dilation[0], conv1d.stride[0], conv1d.groups
        in_c, out_c = conv1d.in_channels, conv1d.out_channels
        pad_in = conv1d.padding                         # "same" | "valid" | int/tuple
        # keep the replacement layers on the source layer's device / dtype
        factory = {"device": conv1d.weight.device, "dtype": conv1d.weight.dtype}

        if d != 1 or s != 1:
            raise ValueError("Wrapper currently supports stride=1, dilation=1 only")

        # Leading output frames dropped in forward(); non-zero only for an
        # even kernel with "same" padding in Case A.
        self._trim_front = 0

        # ----------------------------------------------------
        # Case A — kernel already NPU-friendly
        # ----------------------------------------------------
        if k <= MAX_NPU_KERNEL:
            if isinstance(pad_in, str):
                if pad_in == "same":
                    total = k - 1
                    pad_num = total - total // 2        # ceil(total / 2)
                    self._trim_front = total % 2
                else:                                   # "valid"
                    pad_num = 0
            else:
                pad_num = pad_in[0] if isinstance(pad_in, tuple) else pad_in
            pad_arg = (pad_num, 0)

            self.conv = nn.Conv2d(
                in_channels=in_c, out_channels=out_c,
                kernel_size=(k, 1), stride=(1, 1),
                padding=pad_arg, dilation=(1, 1),
                groups=g, bias=conv1d.bias is not None,
                **factory
            )
            with torch.no_grad():
                self.conv.weight.copy_(conv1d.weight.unsqueeze(-1))
                if conv1d.bias is not None:
                    self.conv.bias.copy_(conv1d.bias)

            _dbg(f"[Conv1dAs2d] k={k} → single Conv2d, pad={pad_arg}, runs on NPU")

        # ----------------------------------------------------
        # Case B — kernel too wide → split into smaller stages
        # ----------------------------------------------------
        else:
            k_list = _split_kernel(k)                   # e.g. 31 → [9, 9, 9, 7]
            layers = []
            for i, ks in enumerate(k_list):
                pad_num = (ks - 1) // 2                 # symmetric
                conv2d = nn.Conv2d(
                    in_channels=in_c, out_channels=out_c,
                    kernel_size=(ks, 1), stride=(1, 1),
                    padding=(pad_num, 0), dilation=(1, 1),
                    groups=g, bias=False,               # leave bias out; easier to fine-tune later
                    **factory
                )
                layers.append(conv2d)
                _dbg(f"[Conv1dAs2d]  ├─ stage {i}: ks={ks}, pad={pad_num}")
            self.conv = nn.Sequential(*layers)

            _dbg(f"[Conv1dAs2d] k={k} split into {k_list} (cascade runs on NPU)\n"
                 "           ⚠ weights not copied — fine-tune is required")

        _dbg("------------------------------------------------------------")

    # forward
    def forward(self, x: torch.Tensor) -> torch.Tensor:   # x: [B, C, T]
        """Run the Conv2d (or cascade) on `x` viewed as [B, C, T, 1]."""
        y = self.conv(x.unsqueeze(-1)).squeeze(-1)
        # emulate Conv1d's right-heavy "same" padding for even kernels
        return y[:, :, self._trim_front:] if self._trim_front else y


### safe pool

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ---------------------------------------------------------------------
# 1. RKNN-safe Attentive Statistics Pooling that matches any channel count
# ---------------------------------------------------------------------
class ASTP_RKNNSafe(nn.Module):
    """Attentive statistics pooling rebuilt so linear1 expects `in_channels` inputs.

    Two fresh 1x1 Conv1d layers are created, initialised from `src_pool`
    and then wrapped in Conv1dAs2d:

      * linear1: the source weights are tiled / truncated along the
        input-channel axis to `in_channels`; the bias is copied as is.
      * linear2: the first `in_channels` output channels (weight rows and
        bias entries) of the source layer are copied.

    Args:
        src_pool: module exposing Conv1d layers `linear1` and `linear2`
            (and optionally `eps`).
        in_channels: channel count of the tensor fed to forward().

    forward() takes x of shape [B, C, T] and returns [B, 2 * C]
    (attention-weighted mean and standard deviation per channel).
    """
    def __init__(self, src_pool: nn.Module, in_channels: int):
        super().__init__()
        mid_channels = src_pool.linear1.out_channels   # 128 per the original author's note
        out_channels = in_channels                     # 600 per the original author's note

        # fresh 1x1 conv layers
        self.linear1 = nn.Conv1d(in_channels, mid_channels, kernel_size=1, bias=True)
        self.linear2 = nn.Conv1d(mid_channels, out_channels, kernel_size=1, bias=True)

        # copy original weights where dimensions allow
        with torch.no_grad():
            # linear1: tile or truncate old weights to fit new in_channels
            old_w1 = src_pool.linear1.weight           # [mid, old_in, 1]
            repeat = (in_channels + old_w1.size(1) - 1) // old_w1.size(1)
            new_w1 = old_w1.repeat(1, repeat, 1)[:, :in_channels, :]
            self.linear1.weight.copy_(new_w1)
            self.linear1.bias.copy_(src_pool.linear1.bias)

            # linear2: keep the first `out_channels` OUTPUT channels. Conv1d
            # weights are [out, in, k], so the slice belongs on axis 0 — the
            # same axis the bias is sliced on.
            self.linear2.weight.copy_(src_pool.linear2.weight[:out_channels, :, :])
            self.linear2.bias.copy_(src_pool.linear2.bias[:out_channels])

        # make them RKNN friendly
        self.linear1 = Conv1dAs2d(self.linear1)
        self.linear2 = Conv1dAs2d(self.linear2)
        self.eps = getattr(src_pool, "eps", 1e-12)

    def forward(self, x):
        # attention weights: softmax over the last (time) axis
        attn = torch.softmax(self.linear2(torch.tanh(self.linear1(x))), dim=-1)
        mean = torch.sum(attn * x, dim=-1)
        var  = torch.sum(attn * (x - mean.unsqueeze(-1)) ** 2, dim=-1)
        std  = torch.pow(var + self.eps, 0.5)          # RKNN keeps Pow on NPU
        return torch.cat([mean, std], dim=1)           # [B, 2 * C]
        


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ASTP_RKNNSafe_cpu(nn.Module):
    """CPU-safe attentive statistics pooling (mean + variance).

    Builds two plain 1x1 Conv1d layers initialised from `src_pool`:

      * linear1: source weights tiled / truncated along the input-channel
        axis to `in_channels`; bias copied as is.
      * linear2: the first `in_channels` output channels (weight rows and
        bias entries) of the source layer.

    Args:
        src_pool: module exposing Conv1d layers `linear1` and `linear2`
            (and optionally `eps`).
        in_channels: channel count C of the tensor fed to forward().

    forward() takes x of shape [B, C, T] and returns [B, 2C]: the
    attention-weighted mean followed by the attention-weighted variance
    (plus eps). No square root is taken.
    """
    def __init__(self, src_pool: nn.Module, in_channels: int):
        super().__init__()
        mid = src_pool.linear1.out_channels   # 128 per the original author's note
        out = in_channels                     # 600 per the original author's note

        self.linear1 = nn.Conv1d(in_channels, mid, kernel_size=1, bias=True)
        self.linear2 = nn.Conv1d(mid,        out, kernel_size=1, bias=True)

        # copy or tile pretrained weights
        with torch.no_grad():
            w1 = src_pool.linear1.weight                 # [mid, old_in, 1]
            reps = (in_channels + w1.size(1) - 1) // w1.size(1)
            self.linear1.weight.copy_(w1.repeat(1, reps, 1)[:, :in_channels, :])
            self.linear1.bias.copy_(src_pool.linear1.bias)

            # Conv1d weights are [out, in, k]: keep the first `out` OUTPUT
            # channels (axis 0), matching how the bias is sliced.
            self.linear2.weight.copy_(src_pool.linear2.weight[:out, :, :])
            self.linear2.bias.copy_(src_pool.linear2.bias[:out])

        self.eps = getattr(src_pool, "eps", 1e-12)

    def forward(self, x):
        # x shape: [B, C, T]
        attn  = torch.softmax(self.linear2(torch.tanh(self.linear1(x))), dim=-1)
        mean  = (attn * x).sum(dim=-1)                    # [B, C]

        diff  = x - mean.unsqueeze(-1)                    # broadcast
        sqr   = diff * diff                              # element-wise square, no Pow
        var   = (attn * sqr).sum(dim=-1) + self.eps       # [B, C]

        return torch.cat([mean, var], dim=1)              # [B, 2C]


### gather

### activations


In [9]:
class NewGELUActivation(nn.Module):
    """
    tanh-based approximation of GELU from Hendrycks & Gimpel (2016):
        0.5 * x * (1 + tanh( √(2/π) · (x + 0.044715 x³) ))
    """
    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(
            (torch.sqrt(torch.tensor(2.0 / torch.pi, device=x.device)) * 
             (x + 0.044715 * torch.pow(x, 3)))
        ))
    

In [10]:
def replace_activation_(module, old_cls=nn.GELU, new_cls=nn.ReLU, **new_kwargs):
    """
    Walk `module` depth-first and, in place, substitute every direct or
    nested child that is an instance of `old_cls` with a freshly built
    `new_cls(**new_kwargs)`. Children that are replaced are not descended
    into. Returns None.
    """
    for child_name, submodule in module.named_children():
        if not isinstance(submodule, old_cls):
            # not a match: look for matches further down this branch
            replace_activation_(submodule, old_cls, new_cls, **new_kwargs)
            continue
        setattr(module, child_name, new_cls(**new_kwargs))
# -----------------------------

## ReDimNetNoMel

In [11]:
########################################
# 2) Define a Model Class without MelBanks
########################################
import torch
import torch.nn as nn

class ReDimNetNoMel(nn.Module):
    """
    A wrapper around the original ReDimNetWrap that:
      - Excludes the 'spec' (MelBanks) module
      - Uses 'backbone', 'bn', and 'linear' from the original, plus a pooling
        layer rebuilt as ASTP_RKNNSafe_cpu
    We expect a precomputed mel spectrogram as input with shape [B, 1, n_mels, time_frames].

    Side effects on `original_wrap`: its backbone is reused, not copied, so
    the Conv1d replacements and the GELU -> NewGELUActivation swap done here
    also change the module objects owned by `original_wrap`.
    """
    def __init__(self, original_wrap):
        super().__init__()

        # Grab references to the submodules we want to keep
        self.backbone = original_wrap.backbone

        # fix problem01
        # list of (stage, block) indices you already know are problematic
        TARGETS = [(0, 6), (1, 6), (2, 7), (3, 8), (4, 8) , (5, 8)]

        for s_idx, b_idx in TARGETS:
            stage = getattr(self.backbone, f"stage{s_idx}")
            for tcm_idx in range(4):
                block = stage[b_idx].tcm[tcm_idx]

                # # test ident
                # block.dwconvs[0] = nn.Identity()
                # block.pwconv1    = nn.Identity()

                ## original fix
                # block.dwconvs[0] = Conv1dAs2d(block.dwconvs[0])
                # block.pwconv1    = Conv1dAs2d(block.pwconv1)   # 1×1 conv

                # test with split
                block.dwconvs[0] = Conv1dAs2d_split(block.dwconvs[0])
                block.pwconv1    = Conv1dAs2d_split(block.pwconv1)   # 1×1 conv

        # #orignal before
        # self.pool   = original_wrap.pool
        # # ------------------------------------------------------------------
        # # 2. patch the pool (ASTP) – its two "linear" Conv1d layers
        # # ------------------------------------------------------------------
        # self.pool.linear1 = Conv1dAs2d(self.pool.linear1)   # 1 × 1 conv 1800→128
        # self.pool.linear2 = Conv1dAs2d(self.pool.linear2)   # 1 × 1 conv 128 →600

        # # # --- pooling --> RKNN-safe variant ------------------------------------
        # # replace pool with the new channel-aware version
        # with torch.no_grad():
        #     dummy = torch.zeros(1, 1, 72, 134)
        #     out_channels = self.backbone(dummy).shape[1]   # 600
        # self.pool = ASTP_RKNNSafe(original_wrap.pool, out_channels)

        # Find the backbone's output channel count with one dummy forward.
        # The probe runs in eval mode so normalisation layers do not update
        # their running statistics from the all-zero input; the previous
        # training flag is restored afterwards (train() applies it to every
        # sub-module of the backbone).
        ref = next(self.backbone.parameters(), None)
        probe_kwargs = {} if ref is None else {"device": ref.device, "dtype": ref.dtype}
        was_training = self.backbone.training
        self.backbone.eval()
        try:
            with torch.no_grad():
                probe = torch.zeros(1, 1, 72, 134, **probe_kwargs)
                c_backbone = self.backbone(probe).shape[1]  # 600 per the original author's note
        finally:
            self.backbone.train(was_training)
        # pooling block now entirely on CPU (no Conv1dAs2d)
        self.pool = ASTP_RKNNSafe_cpu(original_wrap.pool, c_backbone)

        # ---------- tail ----------
        self.bn = original_wrap.bn
        self.linear = original_wrap.linear

        ## Replace every GELU anywhere in this module (backbone, pool, tail)
        replace_activation_(self, old_cls=nn.GELU, new_cls=NewGELUActivation)

    def forward(self, x):
        # x: shape [B, 1, n_mels, time_frames]
        # (1) Pass through the backbone
        x = self.backbone(x)    # shape might become [B, channels, frames] or similar
        # (2) Pooling
        x = self.pool(x)        # ASTP_RKNNSafe_cpu => [B, 2 * channels]
        # (3) BatchNorm
        x = self.bn(x)
        # (4) Final linear => 192-dim (if that's your embedding size)
        x = self.linear(x)
        return x


# Create an instance of our new model that skips the MelBanks front-end.
# Construction prints the Conv1dAs2d debug banners while debug printing is on.
model_no_mel = ReDimNetNoMel(original_model)



[Conv1dAs2d] k=7 → single Conv2d, pad=(3, 0), runs on NPU
------------------------------------------------------------
[Conv1dAs2d] k=1 → single Conv2d, pad=(0, 0), runs on NPU
------------------------------------------------------------
[Conv1dAs2d]  ├─ stage 0: ks=9, pad=4
[Conv1dAs2d]  ├─ stage 1: ks=9, pad=4
[Conv1dAs2d]  ├─ stage 2: ks=3, pad=1
[Conv1dAs2d] k=19 split into [9, 9, 3] (cascade runs on NPU)
           ⚠ weights not copied — fine-tune is required
------------------------------------------------------------
[Conv1dAs2d] k=1 → single Conv2d, pad=(0, 0), runs on NPU
------------------------------------------------------------
[Conv1dAs2d]  ├─ stage 0: ks=9, pad=4
[Conv1dAs2d]  ├─ stage 1: ks=9, pad=4
[Conv1dAs2d]  ├─ stage 2: ks=9, pad=4
[Conv1dAs2d]  ├─ stage 3: ks=7, pad=3
[Conv1dAs2d] k=31 split into [9, 9, 9, 7] (cascade runs on NPU)
           ⚠ weights not copied — fine-tune is required
------------------------------------------------------------
[Conv1dAs2d] k=1 →

# test

In [12]:
# Smoke test: one forward pass on random input of shape [1, 1, 72, 134].
model_no_mel.eval()  # <- this line is critical!
dummy = torch.randn(1, 1, 72, 134)  # also reused later by the FP16 check
model_no_mel(dummy)

tensor([[-0.7342, -0.3765,  0.1065, -2.1669,  0.4111,  0.9082, -0.3310,  1.1415,
          1.6039,  0.0577,  0.7036, -0.9166, -1.4775, -0.8100,  0.3512, -1.4892,
          2.5740, -1.9717,  0.2193, -1.0464,  0.5683, -0.8797,  0.1442, -2.7725,
          1.3930, -3.4797,  1.2009,  0.8568, -2.2360,  0.7037,  1.4462,  0.9154,
         -0.6945, -0.6774,  0.0602, -0.0771, -0.4021, -1.9023,  1.0491, -1.7664,
          0.2396, -0.3444, -3.8919, -1.0772,  0.4447, -0.6326,  0.2721,  0.4667,
         -1.7621, -1.3656,  2.1652,  2.7714,  1.3198, -1.8804,  0.0269,  2.1040,
         -0.2017,  1.0771,  1.5558,  0.0683, -0.7983,  1.2724,  2.2012,  0.5660,
         -2.4000,  0.5101, -0.6607,  2.3502, -0.1457, -1.7068, -0.9537, -0.6582,
         -2.0329,  1.4163,  1.3181, -1.0567, -0.0096,  0.7868,  3.0657, -1.1313,
          1.6675, -0.7146,  0.8327,  2.5079,  0.4475, -1.4926,  0.7370, -1.2295,
         -0.5195, -0.0219,  1.7682,  1.3294, -0.5187,  0.1664, -0.2296, -1.9450,
         -0.6819, -2.1093, -

### PADS?


Conv1dAs2d deliberately switches to padding='same' whenever
pad_num (= (k-1)//2) > max_pad (currently 4).
PyTorch → ONNX keeps that as the auto_pad attribute, and RKNN reacts by
materialising an explicit Pad node that it then assigns to the CPU,
which breaks compilation.

Hardware can handle large numeric pads – it merely dislikes auto_pad.

```
       (dwconvs): ModuleList(
              (0): Conv1dAs2d(
                (conv2d): Conv2d(20, 20, kernel_size=(59, 1), stride=(1, 1), padding=same, groups=20)
              )
            )
            (norm): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act): NewGELUActivation()
            (pwconv1): Conv1dAs2d(
              (conv2d): Conv2d(20, 20, kernel_size=(1, 1), stride=(1, 1))
            )

```

 unsupport cpu Pad op, op name: Pad:/backbone/stage0/stage0.6/tcm/tcm.1/dwconvs.0/Unsqueeze_output_0_pad

In [13]:

# ──────────────────────────────────────────────────────────────
# 4. Inside PyTorch: list every Conv2d that still uses padding="same"
#    (only "same" is reported; other string paddings such as "valid" are
#    no longer mislabelled as "same")
# ──────────────────────────────────────────────────────────────
print('\n=== Conv2d layers with padding mode "same" ===')
for name, mod in model_no_mel.named_modules():
    if isinstance(mod, torch.nn.Conv2d) and mod.padding == "same":
        k, _ = mod.kernel_size      # first kernel dimension only
        d, _ = mod.dilation         # first dilation dimension only
        print(f'  {name:60s}  k={k:2d}  d={d:2d}  mode="same"')


=== Conv2d layers with padding mode "same" ===
  backbone.stem.0                                               k= 3  d= 1  mode="same"
  backbone.stage0.3.conv_block.dwconvs.0                        k= 3  d= 1  mode="same"
  backbone.stage0.4.conv_block.dwconvs.0                        k= 3  d= 1  mode="same"
  backbone.stage1.3.conv_block.dwconvs.0                        k= 3  d= 1  mode="same"
  backbone.stage1.4.conv_block.dwconvs.0                        k= 3  d= 1  mode="same"
  backbone.stage2.3.conv_block.dwconvs.0                        k= 3  d= 1  mode="same"
  backbone.stage2.4.conv_block.dwconvs.0                        k= 3  d= 1  mode="same"
  backbone.stage2.5.conv_block.dwconvs.0                        k= 3  d= 1  mode="same"
  backbone.stage3.3.conv_block.dwconvs.0                        k= 3  d= 1  mode="same"
  backbone.stage3.4.conv_block.dwconvs.0                        k= 3  d= 1  mode="same"
  backbone.stage3.5.conv_block.dwconvs.0                        k= 3  d=

### SIZE?

For stride = 1 the silicon docs spell it out:

    “The minimum supported kernel size is 1 and the maximum is 11 × stride – 1.”
    dl.radxa.com

With stride = 1 the ceiling is 10. Anything wider forces RKNN to split the layer into a CPU-side Pad ➜ Conv pair, and the runtime you’re using still lacks a CPU implementation of Pad, so the build fails even though the Conv itself would happily run on the CPU.

In [14]:
LIMIT = 10         # 11*stride - 1 with stride = 1

# Collect (name, kernel, pad) for every Conv2d whose first kernel dimension
# exceeds LIMIT or whose first padding dimension exceeds 4.
bad = []
for name, m in model_no_mel.named_modules():
    if not isinstance(m, torch.nn.Conv2d):
        continue
    k = m.kernel_size[0]
    if isinstance(m.padding, str):
        pad = (k - 1) // 2          # string padding is treated as "same"
    else:
        pad = m.padding[0]
    if k > LIMIT or pad > 4:
        bad.append((name, k, pad))

if bad:
    print("✗ Layers that violate RKNN limits:")
    for n, k, p in bad:
        print(f"   {n:60s}  k={k}  pad={p}")
else:
    print("✓ All Conv2d layers are within RKNN limits.")

✓ All Conv2d layers are within RKNN limits.


### FP16 check


In [15]:
# Run a half-precision copy of the model on the dummy input and report
# whether every output value is finite.
fp16_net = copy.deepcopy(model_no_mel).half().eval()
with torch.no_grad():
    fp16_out = fp16_net(dummy.half())
ok = torch.isfinite(fp16_out).all()
print('safe in pure FP16?', ok)

safe in pure FP16? tensor(True)


## info

In [16]:
# List activation layers of the patched model that are still in ACT_TYPES.
list_activations(model_no_mel)   

In [17]:
# eval() returns the module, so the notebook displays the full module tree.
model_no_mel.eval()


ReDimNetNoMel(
  (backbone): ReDimNet(
    (stem): Sequential(
      (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=same)
      (1): LayerNorm(C=(16,), data_format=channels_first, eps=1e-06)
      (2): to1d()
    )
    (stage0): Sequential(
      (0): weigth1d(w=(1, 1, 1, 1),sequential=False)
      (1): to2d(f=72,c=16)
      (2): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
      (3): ConvBlock2d(
        (conv_block): ConvNeXtLikeBlock(
          (dwconvs): ModuleList(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=same, groups=4)
          )
          (norm): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): NewGELUActivation()
          (pwconv1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
        )
      )
      (4): ConvBlock2d(
        (conv_block): ConvNeXtLikeBlock(
          (dwconvs): ModuleList(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=same,

In [18]:
# Per-layer output shapes and parameter counts for a [1, 1, 72, 134] input.
summary(model_no_mel, (1, 1, 72, 134))


Layer (type:depth-idx)                                       Output Shape              Param #
ReDimNetNoMel                                                [1, 192]                  --
├─ReDimNet: 1-1                                              [1, 1152, 134]            --
│    └─Sequential: 2-1                                       [1, 1152, 134]            --
│    │    └─Conv2d: 3-1                                      [1, 16, 72, 134]          160
│    │    └─LayerNorm: 3-2                                   [1, 16, 72, 134]          32
│    │    └─to1d: 3-3                                        [1, 1152, 134]            --
│    └─Sequential: 2-2                                       [1, 1152, 134]            --
│    │    └─weigth1d: 3-4                                    [1, 1152, 134]            (1)
│    │    └─to2d: 3-5                                        [1, 16, 72, 134]          --
│    │    └─Conv2d: 3-6                                      [1, 16, 72, 134]          272
│ 

# TORCH SIDE

In [19]:
def torch_inference(wav_path: str):
    """Compute an embedding for one audio file with `model_no_mel`.

    Steps: load the file with torchaudio, average the channels when there
    is more than one, resample to 16 kHz when the file uses another rate,
    convert to log-mel via `myUtils.waveform_to_logmel`, then run the model
    under `torch.no_grad()`.

    Args:
        wav_path: path of the audio file to load.

    Returns:
        The tensor returned by `model_no_mel` for this file.
    """
    target_sample_rate = 16000

    # (a) load; result is [channels, time]
    audio, native_rate = torchaudio.load(wav_path)
    is_multichannel = audio.shape[0] > 1
    if is_multichannel:
        audio = audio.mean(dim=0, keepdim=True)

    # bring the audio to the target rate when it differs
    if native_rate != target_sample_rate:
        to_target = T.Resample(orig_freq=native_rate, new_freq=target_sample_rate)
        audio = to_target(audio)

    # (b) log-mel features
    features = myUtils.waveform_to_logmel(audio)
    print('feeding logmel shape:', features.shape)

    # (c) forward pass without gradient tracking
    with torch.no_grad():
        result = model_no_mel(features)

    print("Embedding shape:", result.shape)
    return result

* run test

In [20]:
# Run the shared voice test with the PyTorch model as the embedding extractor.
# The result is indexed by 'embed0' … 'embed6' in the comparison below.
torch_embedding = test_all_voices(
    extract_speaker_embedding_function = torch_inference,
    cosine_similarity_function = myUtils.cosine_similarity
)

Input waveform shape: torch.Size([1, 32000])
feeding logmel shape: torch.Size([1, 1, 72, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 25776])
Padding log_mel from 108 to 134 frames
feeding logmel shape: torch.Size([1, 1, 72, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 23570])
Padding log_mel from 99 to 134 frames
feeding logmel shape: torch.Size([1, 1, 72, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 32000])
feeding logmel shape: torch.Size([1, 1, 72, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 32000])
feeding logmel shape: torch.Size([1, 1, 72, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 28126])
Padding log_mel from 118 to 134 frames
feeding logmel shape: torch.Size([1, 1, 72, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 32000])
feeding logmel shape: torch.Size([1, 1, 7

## compare to baseline

* compare each voice embedding from the current model with the corresponding baseline-model embedding:

In [21]:
# Cosine similarity between baseline and current embeddings, voice by voice.
for idx in range(7):
    key = f"embed{idx}"
    score = myUtils.cosine_similarity(base_line_embedding[key], torch_embedding[key])
    print(f"Similarity embde{idx}: {score}")

Similarity embde0: 0.407304972410202
Similarity embde1: 0.5619342923164368
Similarity embde2: 0.6011209487915039
Similarity embde3: 0.48210597038269043
Similarity embde4: 0.46966877579689026
Similarity embde5: 0.2696359157562256
Similarity embde6: 0.35251766443252563


# ONNX SIDE

In [22]:
# Export the patched model to ONNX. Per the recorded cell output the helper
# also writes an NHWC variant, ReDimNet_no_mel_nhwc.onnx.
myUtils.export_to_onnx(model_no_mel,onnx_path = "ReDimNet_no_mel.onnx")
# Shell command: show the size of the exported file.
!ls -lah ReDimNet_no_mel.onnx

  (torch.sqrt(torch.tensor(2.0 / torch.pi, device=x.device)) *


Exported NHWC model to ReDimNet_no_mel_nhwc.onnx
Exported to ReDimNet_no_mel.onnx
-rw-rw-r-- 1 vlad vlad 19M Jul  6 07:01 ReDimNet_no_mel.onnx


### store half

In [23]:
# Write half-precision copies of both exported ONNX files (input path, output path).
myUtils.restore_in_half_precision('ReDimNet_no_mel.onnx','ReDimNet_no_mel_fp16.onnx')
myUtils.restore_in_half_precision('ReDimNet_no_mel_nhwc.onnx','ReDimNet_no_mel_nhwc_fp16.onnx')




Converted ReDimNet_no_mel.onnx to half precision and saved as ReDimNet_no_mel_fp16.onnx
Converted ReDimNet_no_mel_nhwc.onnx to half precision and saved as ReDimNet_no_mel_nhwc_fp16.onnx


## verify

In [24]:
# Model file inspected by the verification cells below; swap the comment to
# inspect the full-precision export instead.
# onnx_path = "ReDimNet_no_mel.onnx"
onnx_path = "ReDimNet_no_mel_fp16.onnx"

In [25]:
# Load the selected ONNX file and run the ONNX checker on it; the message
# below is only printed when check_model does not raise.
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)
print("ONNX model is valid!")

ONNX model is valid!


In [26]:
import torch, onnx, textwrap, json
from pathlib import Path

m = onnx_model

# name → initializer, built once instead of scanning the list for every node
initializers = {init.name: init for init in m.graph.initializer}

# Report every Conv node whose auto_pad attribute is set to something other
# than 'NOTSET', with its pads, kernel size, channels and tensor names.
print("\n=== Conv nodes with auto_pad = 'SAME_UPPER' or 'SAME_LOWER' ===")
for n in m.graph.node:
    if n.op_type != 'Conv':
        continue

    auto_attr = next((a for a in n.attribute if a.name == 'auto_pad'), None)
    if auto_attr is None or auto_attr.s.decode() == 'NOTSET':
        continue

    # gather some extra context -------------------------------------------
    # kernel size & in/out channels; the weight tensor may have any rank
    # (1-D, 2-D, ... convs) and may not be stored as an initializer at all
    w_init = initializers.get(n.input[1])
    if w_init is None:
        kernel_desc = "k=?  (weights are not a graph initializer)"
    else:
        C_out, C_in_div_g, *k_dims = w_init.dims
        kH = k_dims[0] if k_dims else '?'
        kernel_desc = f"k={kH}  (C_in={C_in_div_g}, C_out={C_out})"

    pads_attr = next((a for a in n.attribute if a.name == 'pads'), None)
    pads = list(pads_attr.ints) if pads_attr is not None else 'auto'

    print(textwrap.dedent(f"""\
        ── {n.name}
           auto_pad : {auto_attr.s.decode()}
           pads     : {pads}
           kernel   : {kernel_desc}
           input    : {n.input[0]}
           output   : {n.output[0]}"""))


=== Conv nodes with auto_pad = 'SAME_UPPER' or 'SAME_LOWER' ===
── /backbone/stem/stem.0/Conv
   auto_pad : SAME_UPPER
   pads     : auto
   kernel   : k=3  (C_in=1, C_out=16)
   input    : graph_input_cast_0
   output   : /backbone/stem/stem.0/Conv_output_0
── /backbone/stage0/stage0.3/conv_block/dwconvs.0/Conv
   auto_pad : SAME_UPPER
   pads     : auto
   kernel   : k=3  (C_in=4, C_out=16)
   input    : /backbone/stage0/stage0.2/Conv_output_0
   output   : /backbone/stage0/stage0.3/conv_block/dwconvs.0/Conv_output_0
── /backbone/stage0/stage0.4/conv_block/dwconvs.0/Conv
   auto_pad : SAME_UPPER
   pads     : auto
   kernel   : k=3  (C_in=4, C_out=16)
   input    : /backbone/stage0/stage0.3/conv_block/Add_output_0
   output   : /backbone/stage0/stage0.4/conv_block/dwconvs.0/Conv_output_0
── /backbone/stage1/stage1.3/conv_block/dwconvs.0/Conv
   auto_pad : SAME_UPPER
   pads     : auto
   kernel   : k=3  (C_in=4, C_out=32)
   input    : /backbone/stage1/stage1.2/Conv_output_0
   outp

In [27]:
from onnx import AttributeProto, numpy_helper

def walk_graph(g, scope=""):
    """Print every ``Gather`` node found in *g*, including nested sub-graphs.

    Args:
        g: Graph whose ``node`` entries are inspected. Each node is read for
            ``op_type``, ``name``, ``input``, ``output`` and ``attribute``.
        scope: Prefix printed in front of node names. On every descent into a
            sub-graph it is extended with the owning node's name plus "/".

    Returns:
        None. The report is written to stdout.
    """
    for n in g.node:
        # Unnamed nodes get a placeholder both when printed and when they
        # become part of the scope prefix of their sub-graphs.
        label = n.name or '(unnamed)'

        if n.op_type == "Gather":
            # "?" means the node has no explicit `axis` attribute (the ONNX
            # Gather operator then falls back to its default axis, 0).
            axis = next((a.i for a in n.attribute if a.name == "axis"), "?")
            print(f"\n🔹 {scope}{label}   axis={axis}")
            print(f"   inputs : {n.input}")
            print(f"   outputs: {n.output}")

        # dive into sub-graphs (Loop/If/etc.); an attribute may hold a single
        # graph (GRAPH) or a list of graphs (GRAPHS).
        for a in n.attribute:
            if a.type == AttributeProto.GRAPH:
                walk_graph(a.g, f"{scope}{label}/")
            elif a.type == AttributeProto.GRAPHS:
                for sub_graph in a.graphs:
                    walk_graph(sub_graph, f"{scope}{label}/")


# Scan the in-memory ONNX model for Gather nodes; the same banner line is
# printed before and after the report so the section is easy to spot.
m = onnx_model
gather_banner = "\n───────── Gather nodes in model_cf_false_op11.onnx ─────────"

print(gather_banner)
walk_graph(m.graph)
print(gather_banner)



───────── Gather nodes in model_cf_false_op11.onnx ─────────

───────── Gather nodes in model_cf_false_op11.onnx ─────────


In [28]:
import onnx
# Print "<op_type> <node name>" for every top-level graph node whose operator
# type is on the `bad_ops` watch list, in graph order.
m = onnx_model
bad_ops = {"Gather", "Pow", "Sqrt", "Log", "Exp", "Transpose"}  # add more if needed
flagged_nodes = (node for node in m.graph.node if node.op_type in bad_ops)
for node in flagged_nodes:
    print(node.op_type, node.name)

Pow /backbone/stem/stem.1/Pow
Sqrt /backbone/stem/stem.1/Sqrt
Transpose /backbone/stem/stem.2/Transpose
Transpose /backbone/stage0/stage0.1/Transpose
Pow /backbone/stage0/stage0.3/conv_block/act/Pow
Pow /backbone/stage0/stage0.4/conv_block/act/Pow
Transpose /backbone/stage0/stage0.5/Transpose
Pow /backbone/stage0/stage0.6/red_dim_conv/red_dim_conv.1/Pow
Sqrt /backbone/stage0/stage0.6/red_dim_conv/red_dim_conv.1/Sqrt
Pow /backbone/stage0/stage0.6/tcm/tcm.0/act/Pow
Pow /backbone/stage0/stage0.6/tcm/tcm.1/act/Pow
Pow /backbone/stage0/stage0.6/tcm/tcm.2/act/Pow
Pow /backbone/stage0/stage0.6/tcm/tcm.3/act/Pow
Transpose /backbone/stage0/stage0.6/tcm/tcm.4/Transpose
Transpose /backbone/stage0/stage0.6/tcm/tcm.4/attention/Transpose
Transpose /backbone/stage0/stage0.6/tcm/tcm.4/attention/Transpose_1
Transpose /backbone/stage0/stage0.6/tcm/tcm.4/attention/Transpose_2
Transpose /backbone/stage0/stage0.6/tcm/tcm.4/attention/Transpose_3
Transpose /backbone/stage0/stage0.6/tcm/tcm.4/attention/Transp

In [29]:
def inference_onnx(wav_path):
    """
    Compute an embedding for one audio file with ONNX Runtime.

    The model stored at the module-level ``onnx_path`` is loaded and checked,
    the audio at ``wav_path`` is averaged over channels and resampled to
    16 kHz when needed, converted with ``myUtils.waveform_to_logmel`` and fed
    to the model. The first model output is returned as a NumPy array.

    Side effect: a float16 copy of the log-mel is saved next to the audio
    file as ``logmel_<audio file name without extension>.npy``.
    """
    for banner_line in (
        "===================================================",
        "===========   run_inference_onnx   ================",
        "===================================================",
    ):
        print(banner_line)

    # ---- 1) Model: validate the file, then open a runtime session ----------
    model_proto = onnx.load(onnx_path)
    onnx.checker.check_model(model_proto)
    print(f"Loaded and checked ONNX model from: {onnx_path}")

    session = ort.InferenceSession(onnx_path)

    # Only the first declared input and the first declared output are used.
    input_name = session.get_inputs()[0].name
    output_name = session.get_outputs()[0].name

    # ---- 2) Audio -> log-mel -----------------------------------------------
    print("loading audio from:", wav_path)
    waveform, sample_rate = torchaudio.load(wav_path)
    print(f"...Waveform rate {sample_rate}  ; shape : {waveform.shape}")

    # More than one channel: average them into a single one.
    channel_count = waveform.shape[0]
    if channel_count > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Bring the audio to the 16 kHz rate used for the log-mel conversion.
    target_sample_rate = 16000
    if sample_rate != target_sample_rate:
        to_target_rate = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = to_target_rate(waveform)

    log_mel = myUtils.waveform_to_logmel(waveform)

    # ---- 3) ONNX inference -------------------------------------------------
    # ONNX Runtime consumes NumPy arrays.
    log_mel_np = log_mel.cpu().numpy()
    print("logmelshape : ", log_mel_np.shape)

    # Keep a half-precision copy of the model input beside the audio file so
    # it can be inspected or reused later.
    audio_dir, audio_file = os.path.split(wav_path)
    audio_stem = os.path.splitext(audio_file)[0]
    np.save(
        os.path.join(audio_dir, f"logmel_{audio_stem}.npy"),
        log_mel_np.astype(np.float16),
    )

    # session.run returns one entry per requested output; only one is requested.
    embedding = session.run([output_name], {input_name: log_mel_np})[0]

    print("Embedding shape:", embedding.shape)
    return embedding


In [30]:
# Run the ONNX inference function through the shared test_all_voices helper.
# The result is indexed by 'embed0', 'embed1', ... in the comparison cells
# further down.
onnx_embedding = test_all_voices(
    extract_speaker_embedding_function=inference_onnx,
    cosine_similarity_function=myUtils.cosine_similarity_numpys,
    save_embeddings=True,  # Save embeddings to files
)

Loaded and checked ONNX model from: ReDimNet_no_mel_fp16.onnx
loading audio from: /data/proj/voice/redimnet/wrkB2/utils/../audio/test000.wav
...Waveform rate 16000  ; shape : torch.Size([1, 293699])
Input waveform shape: torch.Size([1, 32000])
logmelshape :  (1, 1, 72, 134)
Embedding shape: (1, 192)
Loaded and checked ONNX model from: ReDimNet_no_mel_fp16.onnx
loading audio from: /data/proj/voice/redimnet/wrkB2/utils/../audio/testRob1.wav
...Waveform rate 22050  ; shape : torch.Size([1, 35522])
Input waveform shape: torch.Size([1, 25776])
Padding log_mel from 108 to 134 frames
logmelshape :  (1, 1, 72, 134)
Embedding shape: (1, 192)
Loaded and checked ONNX model from: ReDimNet_no_mel_fp16.onnx
loading audio from: /data/proj/voice/redimnet/wrkB2/utils/../audio/testRob2.wav
...Waveform rate 22050  ; shape : torch.Size([1, 32482])
Input waveform shape: torch.Size([1, 23570])
Padding log_mel from 99 to 134 frames
logmelshape :  (1, 1, 72, 134)
Embedding shape: (1, 192)
Loaded and checked O

### compare onnx with torch

In [31]:
# Cosine similarity between the torch and the ONNX embedding of each entry
# embed0..embed6, printed one line per entry.
for idx in range(7):
    key = f"embed{idx}"
    score = myUtils.cosine_similarity_numpys(torch_embedding[key], onnx_embedding[key])
    print(f"Similarity embde{idx}: {score}")

Similarity embde0: 0.9999987483024597
Similarity embde1: 0.9999989867210388
Similarity embde2: 0.9999990463256836
Similarity embde3: 0.9999991655349731
Similarity embde4: 0.9999986886978149
Similarity embde5: 0.9999983906745911
Similarity embde6: 0.999998927116394


### compare onnx with base line

In [32]:
# Cosine similarity between the base-line and the ONNX embedding of each entry
# embed0..embed6, printed one line per entry.
for idx in range(7):
    key = f"embed{idx}"
    score = myUtils.cosine_similarity_numpys(base_line_embedding[key], onnx_embedding[key])
    print(f"Similarity embde{idx}: {score}")

Similarity embde0: 0.40715181827545166
Similarity embde1: 0.5618564486503601
Similarity embde2: 0.6011956930160522
Similarity embde3: 0.4821603298187256
Similarity embde4: 0.46946969628334045
Similarity embde5: 0.26959607005119324
Similarity embde6: 0.3525412976741791


# fake calibration data

* dummy for nchw

In [33]:
# import os
# import numpy as np
# import torch

# # Directory for calibration inputs
# os.makedirs("calib_npy", exist_ok=True)

# # Create 10 dummy log-mel tensors
# for i in range(10):
#     log_mel = torch.randn(1, 1, 60, 134).numpy().astype(np.float16)
#     np.save(f"calib_npy/sample_{i}.npy", log_mel)

# # Write dataset.txt listing all paths
# with open("dataset.txt", "w") as f:
#     for i in range(10):
#         f.write(f"calib_npy/sample_{i}.npy\n")


* dummy for nhwc

In [34]:
# import os
# import numpy as np
# import torch

# # Directory for calibration inputs
# os.makedirs("calib_npy", exist_ok=True)

# # Create 2 dummy log-mel tensors
# for i in range(2):
#     log_mel = torch.randn(1, 72, 134,1).numpy().astype(np.float16)
#     np.save(f"calib_npy/sample_{i}.npy", log_mel)

# # Write dataset.txt listing all paths
# # NOTE: the loop below lists 10 files but only 2 are created above; keep both ranges in sync
# with open("dataset.txt", "w") as f:
#     for i in range(10):
#         f.write(f"calib_npy/sample_{i}.npy\n")


# converts

```
python convert.py \
       ../wrkB0/ReDimNet_no_mel_fp16.onnx rk3588 fp ReDimNet_no_mel.rknn \
       ../wrkB0/audio/logmel_testRob1.npy  ../wrkB0/audio/embedding_testRob1.torch

```