# RKNN fixed version (noMel/conv1d/activation)


===============================================================

* Build a new noMel model based on the baseline
* Replace the bad layers with working layers
    * no need to change the width of the layers
* Run voices through the model and compare to the baseline

===============================================================

In [1]:
%load_ext autoreload
%autoreload 2
## our utils
from utils.common_import import *
from utils.test_all_voices import *

2.6.0+cu124


In [2]:
%%capture --no-display
import my_utils as myUtils
from play1_setBase_line_B0 import original_model,base_line_embedding

## original activations


In [3]:
# Activation classes that list_activations() reports.
ACT_TYPES = (
    nn.ReLU, nn.ReLU6, nn.LeakyReLU, nn.ELU, nn.PReLU, nn.GELU,
    nn.SiLU, nn.Sigmoid, nn.Tanh, nn.Hardswish
)

def list_activations(model):
    """Print the qualified name and repr of each submodule that is an ACT_TYPES instance."""
    activations = (
        (path, layer)
        for path, layer in model.named_modules()
        if isinstance(layer, ACT_TYPES)
    )
    for path, layer in activations:
        # name is left-aligned and padded to 60 columns, then the module repr
        print(f'{path:<60} {layer}')

In [4]:
# List the activation layers present in original_model, before the RKNN-oriented model is built.
list_activations(original_model)   

backbone.stage0.3.conv_block.relu                            ReLU(inplace=True)
backbone.stage0.4.conv_block.relu                            ReLU(inplace=True)
backbone.stage0.6.tcm.0.act                                  GELU(approximate='none')
backbone.stage0.6.tcm.1.act                                  GELU(approximate='none')
backbone.stage0.6.tcm.2.act                                  GELU(approximate='none')
backbone.stage0.6.tcm.3.act                                  GELU(approximate='none')
backbone.stage1.3.conv_block.relu                            ReLU(inplace=True)
backbone.stage1.4.conv_block.relu                            ReLU(inplace=True)
backbone.stage1.5.conv_block.relu                            ReLU(inplace=True)
backbone.stage1.6.2                                          GELU(approximate='none')
backbone.stage1.8.tcm.0.act                                  GELU(approximate='none')
backbone.stage1.8.tcm.1.act                                  GELU(approximate='none'

## create new model

In [5]:

class Conv1dAs2d(nn.Module):
    """
    Drop-in replacement for an ``nn.Conv1d`` that computes the same result
    with an ``nn.Conv2d`` whose kernel is ``(k, 1)`` (H = kernel, W = 1),
    so that ONNX shows only Conv2d, which RKNN supports.

    Weights and bias are copied from ``conv1d``; the Conv2d is created on the
    same device and with the same dtype as the source weights.

    Args:
        conv1d: the 1-D convolution to mirror. Kernel size, stride, dilation,
            groups, bias and padding (numeric, "same" or "valid") are
            reproduced.

    Note:
        ``conv1d.padding_mode`` is not copied; padding is always done with
        zeros.
    """
    def __init__(self, conv1d: nn.Conv1d):
        super().__init__()

        k, d, s, g = conv1d.kernel_size[0], conv1d.dilation[0], conv1d.stride[0], conv1d.groups

        # --- numeric padding ---
        # Frames of zero padding appended to the END of the time axis in
        # forward(); non-zero only for asymmetric "same" padding.
        self._extra_pad = 0
        if isinstance(conv1d.padding, str):        # "same" or "valid"
            if conv1d.padding == "same":
                total = d * (k - 1)
                pad_num = total // 2
                # When `total` is odd (e.g. even kernel, odd dilation) PyTorch
                # pads one more frame on the right than on the left. A Conv2d
                # padding value is symmetric, so the extra frame is added
                # explicitly in forward().
                self._extra_pad = total - 2 * pad_num
            else:                                  # "valid"
                pad_num = 0
        else:                                      # already a tuple/int
            pad_num = conv1d.padding[0]

        # Build the Conv2d with weights copied
        self.conv2d = nn.Conv2d(
            in_channels  = conv1d.in_channels,
            out_channels = conv1d.out_channels,
            kernel_size  = (k, 1),
            stride       = (s, 1),
            padding      = (pad_num, 0),
            dilation     = (d, 1),
            groups       = g,
            bias         = conv1d.bias is not None,
            device       = conv1d.weight.device,
            dtype        = conv1d.weight.dtype,
        )

        with torch.no_grad():
            # (out, in/groups, k) → (out, in/groups, k, 1)
            self.conv2d.weight.copy_(conv1d.weight.unsqueeze(-1))
            if conv1d.bias is not None:
                self.conv2d.bias.copy_(conv1d.bias)

    def forward(self, x):           # x: [B, C, T]
        """Apply the convolution to ``x`` of shape [B, C, T]; returns [B, C_out, T_out]."""
        # The input must be 3-D: a trailing width-1 axis is added for the
        # Conv2d and removed again afterwards.
        x = x.unsqueeze(-1)         # [B, C, T, 1]
        if self._extra_pad:
            # F.pad order is (W_left, W_right, H_top, H_bottom); the time axis is H here.
            x = nn.functional.pad(x, (0, 0, 0, self._extra_pad))
        return self.conv2d(x).squeeze(-1)


In [6]:
def replace_activation_(module, old_cls=nn.GELU, new_cls=nn.ReLU, **new_kwargs):
    """
    Walk `module` depth-first and substitute each direct or nested child
    that is an instance of `old_cls` with a fresh `new_cls(**new_kwargs)`.

    The module tree is modified in place; nothing is returned. `module`
    itself is never replaced, only its descendants.
    """
    for attr, sub in module.named_children():
        if not isinstance(sub, old_cls):
            # Not a match: keep the child and look inside it instead.
            replace_activation_(sub, old_cls, new_cls, **new_kwargs)
            continue
        # Match: re-register a new activation under the same child name.
        setattr(module, attr, new_cls(**new_kwargs))
# -----------------------------

In [7]:
########################################
# 2) Define a Model Class without MelBanks
########################################
import torch
import torch.nn as nn

class ReDimNetNoMel(nn.Module):
    """
    ReDimNet without the MelBanks front-end, patched for RKNN export.

    Built from the original ReDimNetWrap:
      - Excludes the 'spec' (MelBanks) module
      - Uses 'backbone', 'pool', 'bn', and 'linear'
      - In the TCM blocks listed in TARGETS, wraps ``dwconvs[0]`` and
        ``pwconv1`` in Conv1dAs2d
      - Replaces every nn.GELU in the model with nn.ReLU(inplace=True)
    We expect a precomputed mel spectrogram as input with shape [B, 1, n_mels, time_frames].

    Args:
        original_wrap: model exposing ``backbone``, ``pool``, ``bn`` and
            ``linear`` submodules.
        inplace: when False (default) the submodules are deep-copied first,
            so ``original_wrap`` keeps its own layers and activations. When
            True the submodules are shared with ``original_wrap`` and patched
            in place (the previous behaviour; saves memory but also modifies
            the original model).
    """
    # (stage, block) indices in the backbone that are already known to be problematic
    TARGETS = ((0, 6), (1, 8), (2, 8), (3, 9), (4, 7))
    # Number of TCM sub-blocks patched inside each target block
    TCM_BLOCKS = 4

    def __init__(self, original_wrap, inplace=False):
        super().__init__()
        import copy  # local import: only needed here

        # Without the copy, the layer surgery below would silently rewrite
        # the model that was passed in as well.
        keep = (lambda m: m) if inplace else copy.deepcopy

        # Grab the submodules we want to keep
        self.backbone = keep(original_wrap.backbone)

        # fix problem01: route the targeted 1-D convolutions through Conv2d
        for s_idx, b_idx in self.TARGETS:
            stage = getattr(self.backbone, f"stage{s_idx}")
            for tcm_idx in range(self.TCM_BLOCKS):
                block = stage[b_idx].tcm[tcm_idx]

                block.dwconvs[0] = Conv1dAs2d(block.dwconvs[0])
                block.pwconv1    = Conv1dAs2d(block.pwconv1)   # 1×1 conv

        # Pooling, batch-norm and final projection are reused as they are
        # (no RKNN-specific replacement is applied to them here).
        self.pool = keep(original_wrap.pool)
        self.bn = keep(original_wrap.bn)
        self.linear = keep(original_wrap.linear)

        ## Replace activations in the whole model (backbone, pool, bn, linear)
        replace_activation_(self, old_cls=nn.GELU, new_cls=nn.ReLU, inplace=True)

    def forward(self, x):
        """Map a mel spectrogram [B, 1, n_mels, time_frames] to an embedding."""
        # (1) Pass through the backbone
        x = self.backbone(x)    # shape might become [B, channels, frames] or similar
        # (2) Pooling
        x = self.pool(x)        # ASTP => shape likely [B, embedding_dim]
        # (3) BatchNorm
        x = self.bn(x)
        # (4) Final linear => 192-dim (if that's your embedding size)
        x = self.linear(x)
        return x


# Create an instance of our new model that skips the MelBanks front-end
model_no_mel = ReDimNetNoMel(original_model)



run to test the model


In [8]:
# Scan every Conv2d of model_no_mel and report the ones whose kernel size or
# padding exceeds the limits below.
LIMIT = 10         # 11*stride - 1 with stride = 1
PAD_LIMIT = 4      # largest padding accepted by this check

bad = []
for name, m in model_no_mel.named_modules():
    if not isinstance(m, torch.nn.Conv2d):
        continue

    # Resolve string padding to numbers, one value per spatial dimension.
    if isinstance(m.padding, str):
        if m.padding == "same":
            # "same" pads d*(k-1) frames in total; report the per-side amount
            pads = tuple(d * (ks - 1) // 2 for ks, d in zip(m.kernel_size, m.dilation))
        else:                                      # "valid" => no padding
            pads = (0,) * len(m.kernel_size)
    else:
        pads = tuple(m.padding)

    # Check both spatial dimensions, not only the first one.
    k, pad = max(m.kernel_size), max(pads)
    if k > LIMIT or pad > PAD_LIMIT:
        bad.append((name, k, pad))

if not bad:
    print("✓ All Conv2d layers are within RKNN limits.")
else:
    print("✗ Layers that violate RKNN limits:")
    for n,k,p in bad:
        print(f"   {n:60s}  k={k}  pad={p}")

✗ Layers that violate RKNN limits:
   backbone.stage0.6.tcm.1.dwconvs.0.conv2d                      k=19  pad=9
   backbone.stage0.6.tcm.2.dwconvs.0.conv2d                      k=31  pad=15
   backbone.stage0.6.tcm.3.dwconvs.0.conv2d                      k=59  pad=29
   backbone.stage1.8.tcm.1.dwconvs.0.conv2d                      k=19  pad=9
   backbone.stage1.8.tcm.2.dwconvs.0.conv2d                      k=31  pad=15
   backbone.stage1.8.tcm.3.dwconvs.0.conv2d                      k=59  pad=29
   backbone.stage2.8.tcm.1.dwconvs.0.conv2d                      k=19  pad=9
   backbone.stage2.8.tcm.2.dwconvs.0.conv2d                      k=31  pad=15
   backbone.stage2.8.tcm.3.dwconvs.0.conv2d                      k=59  pad=29
   backbone.stage3.9.tcm.1.dwconvs.0.conv2d                      k=19  pad=9
   backbone.stage3.9.tcm.2.dwconvs.0.conv2d                      k=31  pad=15
   backbone.stage3.9.tcm.3.dwconvs.0.conv2d                      k=59  pad=29
   backbone.stage4.7.tcm.1.dwconv

In [9]:
# Switch to inference mode before running the model.
# NOTE(review): presumably "critical" because the model contains BatchNorm
# layers, which behave differently in training mode — confirm.
model_no_mel.eval()  # <- this line is critical!
# Smoke test on a random input laid out as [B, 1, n_mels, time_frames].
dummy = torch.randn(1, 1, 60, 200)
model_no_mel(dummy)

tensor([[-0.5142,  1.6030, -0.7297, -2.2561,  0.5372,  0.3788,  3.6268, -1.9376,
          0.4963, -0.1588,  1.9016,  0.0542, -2.1352,  0.6386,  1.2388, -1.4947,
          2.5222, -1.7304,  3.2020, -2.4959,  1.0403,  0.3743, -0.8966, -0.5583,
         -0.2115, -3.7184, -0.8057, -0.7268,  0.4182, -2.2033, -0.0536, -1.7411,
          0.2409,  1.2284, -1.5887,  0.4787, -1.6448, -0.0792, -1.7830, -0.5552,
         -0.5940,  0.0618,  4.3995, -2.1242, -0.8426,  0.5148, -0.0223, -1.6560,
         -3.4438, -0.9869, -4.1040,  1.4759, -1.6216,  1.3513,  2.1658, -0.4615,
          1.2447, -0.2935,  0.2195, -2.5226,  0.0621,  0.5176, -1.4498,  0.9213,
         -1.0173,  1.4027, -2.9840,  0.3719, -0.5580, -0.0756, -0.8063,  1.5905,
         -1.4361, -2.2850, -3.0722, -0.4535, -2.8037, -1.3631,  1.6177,  0.7166,
          0.4644,  3.0383,  0.9515,  0.7255, -1.3273,  2.2221,  1.2819,  0.7723,
          0.2885,  0.3929, -0.1728,  1.8508,  0.3380,  0.6587, -1.9119,  0.7565,
          1.5330, -0.1050,  

### FP16 check


In [10]:
# Half-precision smoke test: run a deep copy of the model entirely in FP16 on
# the dummy input and report whether every output value is finite.
fp16_net = copy.deepcopy(model_no_mel).half().eval()
with torch.no_grad():
    fp16_out = fp16_net(dummy.half())
ok = torch.isfinite(fp16_out).all()
print('safe in pure FP16?', ok)

safe in pure FP16? tensor(True)


## info

In [11]:
# List the activation layers of the rebuilt model to check which activation types it now contains.
list_activations(model_no_mel)   

backbone.stage0.3.conv_block.relu                            ReLU(inplace=True)
backbone.stage0.4.conv_block.relu                            ReLU(inplace=True)
backbone.stage0.6.tcm.0.act                                  ReLU(inplace=True)
backbone.stage0.6.tcm.1.act                                  ReLU(inplace=True)
backbone.stage0.6.tcm.2.act                                  ReLU(inplace=True)
backbone.stage0.6.tcm.3.act                                  ReLU(inplace=True)
backbone.stage1.3.conv_block.relu                            ReLU(inplace=True)
backbone.stage1.4.conv_block.relu                            ReLU(inplace=True)
backbone.stage1.5.conv_block.relu                            ReLU(inplace=True)
backbone.stage1.6.2                                          ReLU(inplace=True)
backbone.stage1.8.tcm.0.act                                  ReLU(inplace=True)
backbone.stage1.8.tcm.1.act                                  ReLU(inplace=True)
backbone.stage1.8.tcm.2.act             

In [12]:
# eval() returns the module itself, so as the last expression of the cell it also displays the model structure.
model_no_mel.eval()


ReDimNetNoMel(
  (backbone): ReDimNet(
    (stem): Sequential(
      (0): Conv2d(1, 10, kernel_size=(3, 3), stride=(1, 1), padding=same)
      (1): LayerNorm(C=(10,), data_format=channels_first, eps=1e-06)
      (2): to1d()
    )
    (stage0): Sequential(
      (0): weigth1d(w=(1, 1, 1, 1),sequential=False)
      (1): to2d(f=60,c=10)
      (2): Conv2d(10, 10, kernel_size=(1, 1), stride=(1, 1))
      (3): ConvBlock2d(
        (conv_block): ResBasicBlock(
          (conv1): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=10, bias=False)
          (conv1pw): Conv2d(10, 10, kernel_size=(1, 1), stride=(1, 1))
          (bn1): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=10, bias=False)
          (conv2pw): Conv2d(10, 10, kernel_size=(1, 1), stride=(1, 1))
          (bn2): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_st

In [13]:
# Per-layer output shapes and parameter counts for a (1, 1, 60, 134) input,
# the same log-mel shape that the inference cells feed to the model.
summary(model_no_mel, (1, 1, 60, 134))


Layer (type:depth-idx)                                       Output Shape              Param #
ReDimNetNoMel                                                [1, 192]                  --
├─ReDimNet: 1-1                                              [1, 600, 134]             --
│    └─Sequential: 2-1                                       [1, 600, 134]             --
│    │    └─Conv2d: 3-1                                      [1, 10, 60, 134]          100
│    │    └─LayerNorm: 3-2                                   [1, 10, 60, 134]          20
│    │    └─to1d: 3-3                                        [1, 600, 134]             --
│    └─Sequential: 2-2                                       [1, 600, 134]             --
│    │    └─weigth1d: 3-4                                    [1, 600, 134]             (1)
│    │    └─to2d: 3-5                                        [1, 10, 60, 134]          --
│    │    └─Conv2d: 3-6                                      [1, 10, 60, 134]          110
│ 

# TORCH SIDE

In [14]:
def torch_inference(wav_path: str):
    """
    Compute the embedding of one audio file with ``model_no_mel``.

    The file is loaded with torchaudio, down-mixed to a single channel by
    averaging, resampled to 16 kHz when its rate differs, converted to
    log-mel by ``myUtils.waveform_to_logmel`` and passed through the model
    with gradients disabled. Returns the model output tensor.
    """
    target_sample_rate = 16000

    # (a) Load audio -> [channels, time]
    audio, native_rate = torchaudio.load(wav_path)

    # Collapse multi-channel recordings into one averaged channel
    multi_channel = audio.shape[0] > 1
    if multi_channel:
        audio = audio.mean(dim=0, keepdim=True)

    # Bring the signal to the target rate only when it differs
    if native_rate != target_sample_rate:
        audio = T.Resample(orig_freq=native_rate, new_freq=target_sample_rate)(audio)

    # (b) Log-mel features
    features = myUtils.waveform_to_logmel(audio)
    print('feeding logmel shape:', features.shape)

    # (c) Forward pass without autograd bookkeeping
    with torch.no_grad():
        result = model_no_mel(features)  # shape typically [1, 192] or so

    print("Embedding shape:", result.shape)
    return result

run test

In [15]:
# Run torch_inference through test_all_voices (from utils.test_all_voices).
# The returned mapping is indexed as torch_embedding['embed0'], ... in the
# comparison cells further down.
torch_embedding = test_all_voices(
    extract_speaker_embedding_function = torch_inference,
    cosine_similarity_function = myUtils.cosine_similarity
)

Input waveform shape: torch.Size([1, 32000])
feeding logmel shape: torch.Size([1, 1, 60, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 25776])
Padding log_mel from 108 to 134 frames
feeding logmel shape: torch.Size([1, 1, 60, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 23570])
Padding log_mel from 99 to 134 frames
feeding logmel shape: torch.Size([1, 1, 60, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 32000])
feeding logmel shape: torch.Size([1, 1, 60, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 32000])
feeding logmel shape: torch.Size([1, 1, 60, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 28126])
Padding log_mel from 118 to 134 frames
feeding logmel shape: torch.Size([1, 1, 60, 134])
Embedding shape: torch.Size([1, 192])
Input waveform shape: torch.Size([1, 32000])
feeding logmel shape: torch.Size([1, 1, 6

## compare to baseline

* Compare each voice embedding produced by the current model with the corresponding baseline-model embedding:

In [16]:
# Cosine similarity between the baseline embedding and this model's torch
# embedding, for each of the seven test voices (embed0 .. embed6).
for voice_idx in range(7):
    embed_key = f"embed{voice_idx}"
    similarity = myUtils.cosine_similarity(base_line_embedding[embed_key], torch_embedding[embed_key])
    print(f"Similarity embde{voice_idx}: {similarity}")

Similarity embde0: 0.725683867931366
Similarity embde1: 0.8639224767684937
Similarity embde2: 0.7972668409347534
Similarity embde3: 0.6831815242767334
Similarity embde4: 0.7413924932479858
Similarity embde5: 0.7062074542045593
Similarity embde6: 0.7143062353134155


# ONNX SIDE

In [17]:
# Export the patched model to ONNX; the export settings live in myUtils.export_to_onnx.
myUtils.export_to_onnx(model_no_mel,onnx_path = "ReDimNet_no_mel.onnx")
!ls -lah ReDimNet_no_mel.onnx

Exported to ReDimNet_no_mel.onnx
-rw-rw-r-- 1 vlad vlad 4.1M Jun 24 16:20 ReDimNet_no_mel.onnx


### store half

In [18]:
# Convert the exported ONNX file (first argument) to half precision and save it under the second name.
myUtils.restore_in_half_precision('ReDimNet_no_mel.onnx','ReDimNet_no_mel_fp16.onnx')



Converted ReDimNet_no_mel.onnx to half precision and saved as ReDimNet_no_mel_fp16.onnx


## verify

In [19]:
# Model file loaded by the validation cell and by inference_onnx below.
# Swap the comment to test the full-precision export instead.
# onnx_path = "ReDimNet_no_mel.onnx"
onnx_path = "ReDimNet_no_mel_fp16.onnx"

In [20]:
# Load the selected ONNX file and run the ONNX checker on it before it is used for inference.
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)
print("ONNX model is valid!")

ONNX model is valid!


In [21]:
def inference_onnx(wav_path):
    """
    Loads an audio file, converts to log-mel, and runs inference
    in an ONNX session. Returns the embedding as a NumPy array.

    The model file is taken from the module-level ``onnx_path``. It is
    re-validated and a new session is created on every call.

    Side effect: the log-mel input is also saved in half precision as
    ``logmel_<wav basename>.npy`` in the same folder as ``wav_path``.
    """
    print("===================================================")
    print("===========   run_inference_onnx   ================")
    print("===================================================")
    #######################################
    # 1) Load your ONNX model
    #######################################
    # (Optional) onnx.checker to confirm it's valid
    onnx_model = onnx.load(onnx_path)
    onnx.checker.check_model(onnx_model)
    print(f"Loaded and checked ONNX model from: {onnx_path}")

    # Create an inference session
    session = ort.InferenceSession(onnx_path)

    # Usually we retrieve the first input & output
    input_meta = session.get_inputs()[0]
    input_name = input_meta.name
    output_name = session.get_outputs()[0].name

    #######################################
    # 2) Load audio, get log-mel
    #######################################
    print("loading audio from:", wav_path)
    waveform, sample_rate = torchaudio.load(wav_path)
    print(f"...Waveform rate {sample_rate}  ; shape : {waveform.shape}")

    # If multi-channel, downmix:
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if needed
    target_sample_rate=16000
    if sample_rate != target_sample_rate:
        resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)

    log_mel =  myUtils.waveform_to_logmel(waveform)

    #######################################
    # 3) ONNX Inference
    #######################################
    # Convert to NumPy for ONNX runtime
    log_mel_np = log_mel.cpu().numpy()

    ## save the half-precision log-mel as "logmel_<name>.npy" next to the wav file to check later
    print("logmelshape : ", log_mel_np.shape)
    log_mel_fp16 = log_mel_np.astype(np.float16)  # → half precision
    orig_name = os.path.splitext(os.path.basename(wav_path))[0]
    folder = os.path.dirname(wav_path)
    out_path = os.path.join(folder, f"logmel_{orig_name}.npy")
    np.save(out_path, log_mel_fp16)

    # Feed the dtype the model declares for its input, so the function works
    # both for models with float32 I/O and for models with float16 I/O.
    if input_meta.type == "tensor(float16)":
        model_input = log_mel_fp16
    else:
        model_input = log_mel_np

    # Run inference
    outputs = session.run([output_name], {input_name: model_input})
    # outputs is a list; typically we want the first item
    embedding = outputs[0]  # shape is [1, embedding_dim]

    print("Embedding shape:", embedding.shape)
    return embedding


In [22]:
# Same voice test as the torch run, but with embeddings produced by the ONNX
# session; the NumPy variant of the cosine similarity is used here.
onnx_embedding = test_all_voices(
    extract_speaker_embedding_function = inference_onnx,
    cosine_similarity_function = myUtils.cosine_similarity_numpys,
    save_embeddings=True,  # Save embeddings to files
)

Loaded and checked ONNX model from: ReDimNet_no_mel_fp16.onnx
loading audio from: /data/proj/voice/redimnet/wrkB0/utils/../audio/test000.wav
...Waveform rate 16000  ; shape : torch.Size([1, 293699])
Input waveform shape: torch.Size([1, 32000])
logmelshape :  (1, 1, 60, 134)


[0;93m2025-06-24 16:20:34.385699800 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m
[0;93m2025-06-24 16:20:34.391312986 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m
[0;93m2025-06-24 16:20:34.603021950 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m
[0;93m2025-06-24 16:20:34.608076966 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m


Embedding shape: (1, 192)
Loaded and checked ONNX model from: ReDimNet_no_mel_fp16.onnx
loading audio from: /data/proj/voice/redimnet/wrkB0/utils/../audio/testRob1.wav
...Waveform rate 22050  ; shape : torch.Size([1, 35522])
Input waveform shape: torch.Size([1, 25776])
Padding log_mel from 108 to 134 frames
logmelshape :  (1, 1, 60, 134)
Embedding shape: (1, 192)
Loaded and checked ONNX model from: ReDimNet_no_mel_fp16.onnx
loading audio from: /data/proj/voice/redimnet/wrkB0/utils/../audio/testRob2.wav
...Waveform rate 22050  ; shape : torch.Size([1, 32482])
Input waveform shape: torch.Size([1, 23570])
Padding log_mel from 99 to 134 frames
logmelshape :  (1, 1, 60, 134)
Embedding shape: (1, 192)
Loaded and checked ONNX model from: ReDimNet_no_mel_fp16.onnx
loading audio from: /data/proj/voice/redimnet/wrkB0/utils/../audio/test_human1_1.wav
...Waveform rate 16000  ; shape : torch.Size([1, 65867])
Input waveform shape: torch.Size([1, 32000])


[0;93m2025-06-24 16:20:34.796828050 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m
[0;93m2025-06-24 16:20:34.802532644 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m
[0;93m2025-06-24 16:20:34.991791513 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m
[0;93m2025-06-24 16:20:34.998575223 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m


logmelshape :  (1, 1, 60, 134)
Embedding shape: (1, 192)
Loaded and checked ONNX model from: ReDimNet_no_mel_fp16.onnx
loading audio from: /data/proj/voice/redimnet/wrkB0/utils/../audio/test_human1_2.wav
...Waveform rate 16000  ; shape : torch.Size([1, 101189])
Input waveform shape: torch.Size([1, 32000])
logmelshape :  (1, 1, 60, 134)


[0;93m2025-06-24 16:20:35.179202480 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m
[0;93m2025-06-24 16:20:35.185068402 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m
[0;93m2025-06-24 16:20:35.372573246 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m
[0;93m2025-06-24 16:20:35.378513260 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m


Embedding shape: (1, 192)
Loaded and checked ONNX model from: ReDimNet_no_mel_fp16.onnx
loading audio from: /data/proj/voice/redimnet/wrkB0/utils/../audio/test_human2_1.wav
...Waveform rate 48000  ; shape : torch.Size([1, 84376])
Input waveform shape: torch.Size([1, 28126])
Padding log_mel from 118 to 134 frames
logmelshape :  (1, 1, 60, 134)
Embedding shape: (1, 192)
Loaded and checked ONNX model from: ReDimNet_no_mel_fp16.onnx
loading audio from: /data/proj/voice/redimnet/wrkB0/utils/../audio/test_human2_2.wav
...Waveform rate 48000  ; shape : torch.Size([1, 159256])
Input waveform shape: torch.Size([1, 32000])
logmelshape :  (1, 1, 60, 134)
Embedding shape: (1, 192)
**************************************************************************
*************************   compare summary ******************************
**************************************************************************
====>>>> should be similar:
Similarity (robot1 to robot2 ): 0.8749251961708069
Similarity (human1

[0;93m2025-06-24 16:20:35.545177135 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m
[0;93m2025-06-24 16:20:35.550787492 [W:onnxruntime:, constant_folding.cc:268 ApplyImpl] Could not find a CPU kernel and hence can't constant fold Sub node '/pool/Sub_1'[m


### compare onnx with torch

In [23]:
# Cosine similarity between the torch embedding and the ONNX embedding of
# the same voice, for each of the seven test voices (embed0 .. embed6).
for voice_idx in range(7):
    embed_key = f"embed{voice_idx}"
    similarity = myUtils.cosine_similarity_numpys(torch_embedding[embed_key], onnx_embedding[embed_key])
    print(f"Similarity embde{voice_idx}: {similarity}")

Similarity embde0: 0.9999975562095642
Similarity embde1: 0.9999975562095642
Similarity embde2: 0.999997615814209
Similarity embde3: 0.9999982118606567
Similarity embde4: 0.9999983906745911
Similarity embde5: 0.9999959468841553
Similarity embde6: 0.9999971389770508


### compare onnx with base line

In [24]:
# Cosine similarity between the baseline embedding and the ONNX embedding of
# the same voice, for each of the seven test voices (embed0 .. embed6).
for voice_idx in range(7):
    embed_key = f"embed{voice_idx}"
    similarity = myUtils.cosine_similarity_numpys(base_line_embedding[embed_key], onnx_embedding[embed_key])
    print(f"Similarity embde{voice_idx}: {similarity}")

Similarity embde0: 0.7255515456199646
Similarity embde1: 0.863834023475647
Similarity embde2: 0.7971532940864563
Similarity embde3: 0.6831507086753845
Similarity embde4: 0.7414388656616211
Similarity embde5: 0.7058244943618774
Similarity embde6: 0.7139862775802612


## Fake calibration data

In [25]:
# import os
# import numpy as np
# import torch

# # Directory for calibration inputs
# os.makedirs("calib_npy", exist_ok=True)

# # Create 10 dummy log-mel tensors
# for i in range(10):
#     log_mel = torch.randn(1, 1, 60, 134).numpy().astype(np.float16)
#     np.save(f"calib_npy/sample_{i}.npy", log_mel)

# # Write dataset.txt listing all paths
# with open("dataset.txt", "w") as f:
#     for i in range(10):
#         f.write(f"calib_npy/sample_{i}.npy\n")


## Convert to RKNN

```
python convert.py \
       ../wrkB0/ReDimNet_no_mel_fp16.onnx rk3588 fp ReDimNet_no_mel.rknn \
       ../wrkB0/audio/logmel_testRob1.npy  ../wrkB0/audio/embedding_testRob1.torch

```