# Set the baseline for the REFERENCE MODEL

===============================================================

* load the model
* extract embeddings 
* test some voices to set baseline

===============================================================

In [3]:
%load_ext autoreload
%autoreload 2
## our utils
from utils.common_import import *
from utils.test_all_voices import *
 

2.6.0+cu124


* configure the starting-point model and load it

In [4]:
def load_orig_model_B0(model_name='b2', train_type='ptn', dataset='vox2',
                       hub_dir='/data/proj/voice/redimnet/models'):
    """
    Load a pretrained ReDimNet model through torch.hub.

    NOTE(review): the function name says "B0" but the default variant that is
    loaded is 'b2' (kept unchanged so existing results stay reproducible) —
    confirm which one is intended.

    Parameters:
    - model_name: ReDimNet variant forwarded to the hub entry point (default: 'b2')
    - train_type: Checkpoint type forwarded to the hub entry point
                  (default: 'ptn'; 'ft_lm' was the alternative noted by the author)
    - dataset: Training dataset tag forwarded to the hub entry point (default: 'vox2')
    - hub_dir: Directory torch.hub uses as its cache. Pass None to leave the
               current torch.hub directory untouched.

    Returns:
    - model: The object returned by torch.hub.load for 'IDRnD/ReDimNet'
    """
    # set_dir changes torch.hub's process-wide cache location, so it is optional
    if hub_dir is not None:
        torch.hub.set_dir(hub_dir)

    model = torch.hub.load('IDRnD/ReDimNet', 'ReDimNet',
                           model_name=model_name,
                           train_type=train_type,
                           dataset=dataset)

    return model

# Instantiate the reference model once; extract_speaker_embedding below reads this global.
original_model = load_orig_model_B0()
# Switch to inference mode. As the last expression of the cell, this also displays the module repr.
original_model.eval()


Using cache found in /data/proj/voice/redimnet/models/IDRnD_ReDimNet_master


/data/proj/voice/redimnet/models/IDRnD_ReDimNet_master
load_res : <All keys matched successfully>


ReDimNetWrap(
  (backbone): ReDimNet(
    (stem): Sequential(
      (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=same)
      (1): LayerNorm(C=(16,), data_format=channels_first, eps=1e-06)
      (2): to1d()
    )
    (stage0): Sequential(
      (0): weigth1d(w=(1, 1, 1, 1),sequential=False)
      (1): to2d(f=72,c=16)
      (2): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
      (3): ConvBlock2d(
        (conv_block): ConvNeXtLikeBlock(
          (dwconvs): ModuleList(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=same, groups=4)
          )
          (norm): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): GELU(approximate='none')
          (pwconv1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
        )
      )
      (4): ConvBlock2d(
        (conv_block): ConvNeXtLikeBlock(
          (dwconvs): ModuleList(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=s

In [5]:
# Layer-by-layer summary for a single 32000-sample waveform (batch of 1).
# NOTE(review): `summary` comes from the star imports above — presumably torchinfo.summary; confirm.
summary(original_model, input_size=(1, 32000))

  with torch.cuda.amp.autocast(enabled=False):


Layer (type:depth-idx)                                       Output Shape              Param #
ReDimNetWrap                                                 [1, 192]                  --
├─MelBanks: 1-1                                              [1, 72, 134]              --
│    └─Sequential: 2-1                                       [1, 72, 134]              --
│    │    └─Identity: 3-1                                    [1, 32000]                --
│    │    └─PreEmphasis: 3-2                                 [1, 32000]                --
│    │    └─MelSpectrogram: 3-3                              [1, 72, 134]              --
├─ReDimNet: 1-2                                              [1, 1152, 134]            --
│    └─Sequential: 2-2                                       [1, 1152, 134]            --
│    │    └─Conv2d: 3-4                                      [1, 16, 72, 134]          160
│    │    └─LayerNorm: 3-5                                   [1, 16, 72, 134]          32
│   

## CALC FUNCTIONS

In [6]:
def extract_speaker_embedding(wav_path, target_sample_rate=16000, target_length=32000, model=None):
    """
    Extracts a speaker embedding from a given WAV file.

    The audio is loaded, down-mixed to mono, resampled to `target_sample_rate`,
    then zero-padded at the end or truncated to exactly `target_length` samples
    before being passed to the model under torch.no_grad().

    Parameters:
    - wav_path: Path to the WAV file
    - target_sample_rate: Sample rate the model expects (default: 16kHz)
    - target_length: Number of samples the model expects (default: 32000 = 2 sec @ 16kHz).
                     Must be positive.
    - model: Callable that maps a (1, target_length) waveform tensor to an embedding.
             Defaults to the notebook-level `original_model` when None.

    Returns:
    - speaker_embedding: Whatever `model` returns for the prepared waveform
                         (expected: a tensor of shape (1, embedding_dim))

    Raises:
    - ValueError: If `target_length` is not positive.
    """
    # A negative length would silently slice from the end and zero would give an
    # empty waveform, so reject both before doing any I/O.
    if target_length <= 0:
        raise ValueError(f"target_length must be positive, got {target_length}")

    # Fall back to the notebook-level reference model so existing callers keep working
    if model is None:
        model = original_model

    # Load audio file
    waveform, sample_rate = torchaudio.load(wav_path)

    # Convert to mono if needed (average the channels, keep the channel axis)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if needed
    if sample_rate != target_sample_rate:
        resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)

    # Ensure the waveform has exactly `target_length` samples
    num_samples = waveform.shape[1]
    if num_samples < target_length:
        # Pad with zeros at the end if too short
        waveform = F.pad(waveform, (0, target_length - num_samples))
        print(f"Padding waveform to {target_length} samples.")
    elif num_samples > target_length:
        # Keep only the first `target_length` samples if too long
        waveform = waveform[:, :target_length]
        print(f"Trimming waveform to {target_length} samples.")
    # When the length already matches there is nothing to do and nothing to report.

    # Ensure correct shape (batch_size, num_samples)
    print(f"waveform Sample Shape: {waveform.shape} ; type : {type(waveform)}")

    # Extract speaker embedding without tracking gradients
    with torch.no_grad():
        speaker_embedding = model(waveform)

    print(f"Speaker Embedding Shape: {speaker_embedding.shape} ; type : {type(speaker_embedding)}")  # Expected: (1, embedding_dim)

    return speaker_embedding


In [7]:
# Compute similarity between two embeddings
def cosine_similarity(embedding1, embedding2):
    """
    Cosine similarity between two single embeddings, returned as a Python float.

    Parameters:
    - embedding1, embedding2: Tensors of shape (1, D) or (D,)

    Returns:
    - float: the cosine similarity of the two embeddings

    Note: `.item()` requires a single-element result, so batches with more than
    one row still raise, exactly as before.
    """
    # F.cosine_similarity reduces over dim=1 by default, which does not exist on a
    # 1-D tensor; promote bare vectors to a single-row batch first.
    if embedding1.dim() == 1:
        embedding1 = embedding1.unsqueeze(0)
    if embedding2.dim() == 1:
        embedding2 = embedding2.unsqueeze(0)
    return F.cosine_similarity(embedding1, embedding2).item()


## set BASE LINE

In [8]:
# Run the shared voice test with this notebook's embedding extractor and similarity
# function, and keep whatever it returns as the baseline for later comparisons.
# NOTE(review): test_all_voices comes from the star imports above (presumably
# utils.test_all_voices); which files it reads and what it returns is not visible here — confirm.
base_line_embedding = test_all_voices(
    extract_speaker_embedding_function = extract_speaker_embedding,
    cosine_similarity_function = cosine_similarity
)

Trimming waveform to 32000 samples.
waveform Sample Shape: torch.Size([1, 32000]) ; type : <class 'torch.Tensor'>


Speaker Embedding Shape: torch.Size([1, 192]) ; type : <class 'torch.Tensor'>
Padding waveform to 32000 samples.
waveform Sample Shape: torch.Size([1, 32000]) ; type : <class 'torch.Tensor'>
Speaker Embedding Shape: torch.Size([1, 192]) ; type : <class 'torch.Tensor'>
Padding waveform to 32000 samples.
waveform Sample Shape: torch.Size([1, 32000]) ; type : <class 'torch.Tensor'>
Speaker Embedding Shape: torch.Size([1, 192]) ; type : <class 'torch.Tensor'>
Trimming waveform to 32000 samples.
waveform Sample Shape: torch.Size([1, 32000]) ; type : <class 'torch.Tensor'>
Speaker Embedding Shape: torch.Size([1, 192]) ; type : <class 'torch.Tensor'>
Trimming waveform to 32000 samples.
waveform Sample Shape: torch.Size([1, 32000]) ; type : <class 'torch.Tensor'>
Speaker Embedding Shape: torch.Size([1, 192]) ; type : <class 'torch.Tensor'>
Padding waveform to 32000 samples.
waveform Sample Shape: torch.Size([1, 32000]) ; type : <class 'torch.Tensor'>
Speaker Embedding Shape: torch.Size([1, 192

## test input len

* test how the target input length affects the output embeddings

In [11]:

# Embed the same file twice, changing only the number of samples fed to the model:
# 32000 samples for embd_len1 versus 16000 samples for embd_len2.
embd_len1 = extract_speaker_embedding(wav_path='./audio/test000.wav',target_sample_rate=16000, target_length=32000)
embd_len2 = extract_speaker_embedding(wav_path='./audio/test000.wav',target_sample_rate=16000, target_length=16000)

Trimming waveform to 32000 samples.
waveform Sample Shape: torch.Size([1, 32000]) ; type : <class 'torch.Tensor'>


  with torch.cuda.amp.autocast(enabled=False):


Speaker Embedding Shape: torch.Size([1, 192]) ; type : <class 'torch.Tensor'>
Trimming waveform to 16000 samples.
waveform Sample Shape: torch.Size([1, 16000]) ; type : <class 'torch.Tensor'>
Speaker Embedding Shape: torch.Size([1, 192]) ; type : <class 'torch.Tensor'>


In [15]:
# Similarity between the two embeddings of the same file at different input lengths
# (left as the last expression so the value is displayed as the cell output).
cosine_similarity(embd_len1, embd_len2)

0.33534541726112366