In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from argparse import Namespace
from typing import Any, NamedTuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

In [3]:
import torchaudio
from torch.utils.data import ConcatDataset, DataLoader, Subset

# Load LibriSpeech dataset


LibriSpeech 数据集是语音识别领域的流行资源，广泛用于训练和评估机器学习模型。它包含大约 1,000 小时的英语语音，来源于 LibriVox 项目中的有声读物。根据数据量和录音质量，数据集被划分为不同的子集，使其适用于自动语音识别（ASR）的各种任务。


Dataset Structure:

- The dataset is organized into several parts:
  - train-clean-100: 100 hours of clean speech for training.
  - train-clean-360: 360 hours of clean speech for training.
  - train-other-500: 500 hours of speech from various sources that may include background noise, making it less clean.
  - dev-clean: A development set containing clean speech.
  - dev-other: A development set containing more diverse and noisy speech.
  - test-clean: A test set of clean speech.
  - test-other: A test set with more diverse and noisy speech.
- Audio Format:
  - The audio files are typically in the `flac` format, sampled at 16 kHz, and stored as mono channels.


:::{note}
We use all the train data for training, not just a subset. Thus, **Total 960 hours** are used to train the Kmeans model.
:::


In [4]:
dataset_root_path = "/home/ay/data2/datasets/Lib"

## Help functions


In [5]:
def prepare_dataset(feature_extractor):

    def return_collate_fn(batch):
        res = {}
        model_input_name = feature_extractor.model_input_names[0]


        waveform = [item[0][0].numpy() for item in batch]
        inputs = feature_extractor(
            waveform,
            sampling_rate=16000,
            return_attention_mask=True,
            padding=True,
            return_tensors="pt",
        )
        # print(inputs.get(model_input_name).shape)
        
        res['org_waveform'] = torch.nn.utils.rnn.pad_sequence([item[0][0] for item in batch], batch_first=True)

        res[model_input_name] = inputs.get(model_input_name)
        res['waveform'] = inputs.get(model_input_name)
        res["attention_mask"] = inputs.get("attention_mask")
        return res

    return return_collate_fn

In [6]:
def default_collate_fn(batch):
    """Collate function to pad audio waveforms for batching.

    This function takes a batch of audio waveforms and their associated
    metadata and pads the waveforms to ensure they all have the same length
    for efficient processing in neural networks.

    Args:
        batch (list): A list of tuples, where each tuple contains:
            - waveform (Tensor): The audio waveform tensor.
            - sample_rate (int): The sample rate of the audio.
            - utterance (str): The transcription of the audio.
            - speaker_id (int): The ID of the speaker.
            - chapter_id (int): The ID of the chapter.
            - file_id (int): The ID of the file.

    Returns:
        tuple: A tuple containing:
            - padded_waveforms (Tensor): A tensor of padded audio waveforms.
            - sample_rate (int): The sample rate of the audio.
            - transcriptions (list): A list of transcriptions for the audio.

    Raises:
        ValueError: If the input batch is empty.
    """

    # Get the maximum length of the waveforms in the batch
    max_length = max(waveform.shape[1] for waveform, *_ in batch)

    # Pad the waveforms and create a tensor for the batch
    padded_waveforms = []
    transcriptions = []

    for waveform, sample_rate, utterance, speaker_id, chapter_id, file_id in batch:
        # Pad the waveform with zeros
        padded_waveform = torch.nn.functional.pad(
            waveform, (0, max_length - waveform.shape[1])
        )
        padded_waveforms.append(padded_waveform)
        transcriptions.append(utterance)

    # Stack the waveforms into a single tensor
    return torch.concat(padded_waveforms), sample_rate, transcriptions

:::{note}
We use a `percentage` of 960 hours for training.
:::


In [7]:
def load_librispeech_dataset(
    root_path: str,
    split="train",
    batch_size: int = 32,
    num_workers: int = 2,
    percentage=0.1,
    collate_fn=default_collate_fn,
) -> DataLoader:
    """Load the LibriSpeech dataset and return a DataLoader.

    This function initializes the LibriSpeech dataset and creates a DataLoader
    with a custom collate function to handle variable-length audio samples.

    Args:
        root_path (str): The path to the directory where the dataset will be stored.
        split (str): train or val .
        batch_size (int, optional): The number of samples per batch. Defaults to 32.
        num_workers (int, optional): The number of subprocesses to use for data loading.
            Defaults to 2.
        percentage (float, optional): The percentage that used to generate features

    Returns:
        DataLoader: A DataLoader object that provides batches of audio waveforms
        and their corresponding metadata from the LibriSpeech dataset.

    Raises:
        FileNotFoundError: If the dataset cannot be downloaded or accessed.
    """

    if split == "train":
        # Load individual datasets
        train_clean_100 = torchaudio.datasets.LIBRISPEECH(
            root=root_path, url="train-clean-100", download=True
        )
        train_clean_360 = torchaudio.datasets.LIBRISPEECH(
            root=root_path, url="train-clean-360", download=True
        )
        train_other_500 = torchaudio.datasets.LIBRISPEECH(
            root=root_path, url="train-other-500", download=True
        )
        # Combine the datasets
        dataset = ConcatDataset([train_clean_100, train_clean_360, train_other_500])
    else:
        dev_clean = torchaudio.datasets.LIBRISPEECH(
            root=root_path, url="dev-clean", download=True
        )
        dev_other = torchaudio.datasets.LIBRISPEECH(
            root=root_path, url="dev-other", download=True
        )
        dataset = ConcatDataset([dev_clean, dev_other])

    # Randomly select 10% of the dataset,  and then create a Subset using the selected indices
    subset_size = int(len(dataset) * percentage)
    indices = torch.randperm(len(dataset))[:subset_size]
    dataset = Subset(dataset, indices)

    # Create a DataLoader with the custom collate function
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )

    return dataloader

## Load dataloader


In [8]:
# dataloader = load_librispeech_dataset(dataset_root_path, split="train", percentage=0.2, num_workers=4)

# # Iterate through the DataLoader
# for batch in dataloader:
#     waveform, sample_rate, utterance = batch
#     print(f"Waveform shape: {waveform.shape}, Sample Rate: {sample_rate}")
#     # print(f"Utterance: {utterance}")
#     break

# WavLM


## Intro of WavLM

WavLM is a state-of-the-art model introduced for automatic speech recognition (ASR) and other speech-related tasks. Developed by researchers at Microsoft, WavLM builds upon the success of prior models like Wav2Vec 2.0, enhancing the ability to learn representations from raw audio and improving performance across various speech processing applications.

### Key Features of WavLM

1. **Architecture**:

   - WavLM utilizes a transformer-based architecture similar to Wav2Vec 2.0 but incorporates several innovations to handle speech signals more effectively.
   - It employs a feature extraction layer that processes raw audio inputs into a more manageable format for the transformer layers.

2. **Self-Supervised Learning**:

   - WavLM is trained using self-supervised learning techniques, which allow it to learn from unlabeled data. This is particularly beneficial in scenarios where labeled data is scarce.
   - The model is pre-trained on a large corpus of unlabeled audio, enabling it to capture various acoustic characteristics and linguistic features.

3. **Multi-Task Learning**:

   - WavLM is designed to perform multiple tasks, including speech recognition, speaker identification, and emotion recognition, making it versatile for different applications.
   - The model can be fine-tuned on specific tasks, allowing it to adapt to various domains and improve its performance.

4. **Robustness**:

   - One of the significant advancements of WavLM is its robustness to different acoustic conditions and noise levels, making it suitable for real-world applications where audio quality may vary.

5. **Fine-Tuning Capabilities**:
   - After pre-training, WavLM can be fine-tuned on specific datasets, allowing it to achieve high accuracy on specific tasks like ASR or speaker verification.
   - Fine-tuning can involve supervised data, enabling the model to adapt to the nuances of the target domain effectively.


We use the [WavLMModel](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/models/wavlm/modeling_wavlm.py#L1105) to extract audio features and then train the Kmeans model


In [9]:
from transformers import WavLMModel, Wav2Vec2FeatureExtractor, AutoFeatureExtractor

In [None]:
class WavLM(nn.Module):
    """
    https://huggingface.co/docs/transformers/model_doc/wavlm#transformers.WavLMForCTC
    """

    def __init__(self, pretrained_name="microsoft/wavlm-base"):
        super().__init__()
        print("Load WavLM model!!!!!!!")
        self.model = WavLMModel.from_pretrained(pretrained_name)
        self.model.lm_head = nn.Identity()

    def forward(self, x, attention_mask=None):
        outputs = self.model(x, attention_mask=attention_mask)
        return outputs

In [11]:
pretrained_name = (
    "/home/ay/.cache/huggingface/hub/models--microsoft--wavlm-base"
    "/snapshots/efa81aae7ff777e464159e0f877d54eac5b84f81/"
)

model = WavLM(pretrained_name).cuda(1)
model = model.eval()

  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at /home/ay/.cache/huggingface/hub/models--microsoft--wavlm-base/snapshots/efa81aae7ff777e464159e0f877d54eac5b84f81/ were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at /home/ay/.cache/huggingface/hub/models--microsoft--wavlm-base/snapshots/efa81aae7ff777e464159e0f877d54eac5b84f81/ and are newly initialized: ['

Load WavLM model!!!!!!!


In [12]:
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base")
feature_extractor



Wav2Vec2FeatureExtractor {
  "do_normalize": false,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [13]:
# with torch.no_grad():
#     res = model.model(waveform.to(next(model.parameters()).device))
# res.hidden_states

## Load Validation Split and obtain val features


In [14]:
val_dataloader = load_librispeech_dataset(
    dataset_root_path, split="val", percentage=1.0, num_workers=4, batch_size=8, collate_fn=prepare_dataset(feature_extractor)
)
val_feat = []
with torch.no_grad():
    for i, batch in enumerate(tqdm(val_dataloader, desc="Processing Audio")):
        waveform = batch['waveform']
        outputs = model(waveform.cuda(1), attention_mask=batch['attention_mask'].cuda(1))
        last_hidden_states = outputs.last_hidden_state.cpu().numpy() # [batch, frames, features]
        for j, x in enumerate(last_hidden_states):
            length = batch['attention_mask'][j].sum() // 320
            val_feat.append(last_hidden_states[j, :length])

val_feat = np.concatenate(val_feat)

Processing Audio: 100%|██████████| 696/696 [01:11<00:00,  9.73it/s]


In [15]:
def validate_kmeans(kmeans, features, metric="inertia"):
    """
    Validate a trained KMeans model using features from a validation DataLoader.

    Args:
        model (torch.nn.Module): Pretrained feature extraction model (e.g., WavLM).
        dataloader (torch.utils.data.DataLoader): DataLoader for validation data.
        processor: Processor for preprocessing input data for the model.
        kmeans (MiniBatchKMeans): Trained MiniBatchKMeans model.
        metric (str): Metric to compute. Options: "inertia" (default) or "silhouette".

    Returns:
        float: Computed evaluation metric (e.g., inertia or silhouette score).
    """

    from sklearn.metrics import silhouette_score

    features_list = []
    cluster_assignments = []

    # Predict cluster assignments
    cluster_labels = kmeans.predict(features)

    # Compute the specified metric
    if metric == "inertia":
        # Inertia: Sum of squared distances of samples to their closest cluster center
        return kmeans.inertia_

    elif metric == "silhouette":
        # Silhouette score: Measures the separation between clusters
        return silhouette_score(features, cluster_labels)

    else:
        raise ValueError(
            f"Unknown metric '{metric}'. Supported metrics: 'inertia', 'silhouette'."
        )

In [16]:
# validate_kmeans(kmeans_model, val_feat, metric="inertia")

# Train Kmeans


## Introduction of Kmeans

MiniBatchKMeans is a variant of the traditional K-Means clustering algorithm that is designed to handle large datasets more efficiently. It was introduced to overcome some of the scalability issues associated with standard K-Means, particularly when dealing with very large datasets that may not fit into memory. Here's an overview of its key features, advantages, and applications:

### Key Features of MiniBatchKMeans

1. **Batch Processing**:

   - Unlike the standard K-Means algorithm, which uses the entire dataset to update the cluster centroids in each iteration, MiniBatchKMeans processes small, random subsets (mini-batches) of the data.
   - This approach significantly reduces the computational burden and memory requirements, allowing the algorithm to scale to larger datasets.

2. **Algorithm Steps**:

   - **Initialization**: Similar to K-Means, the algorithm starts by randomly initializing the centroids of the clusters.
   - **Mini-Batch Selection**: In each iteration, a small random sample of the data is selected as a mini-batch.
   - **Centroid Update**: The centroids are updated based on the mini-batch rather than the entire dataset, which leads to faster convergence.
   - **Iteration**: This process is repeated for a specified number of iterations or until the centroids converge to stable values.

3. **Reduced Memory Usage**:

   - Because MiniBatchKMeans only needs to store and process the mini-batch at any given time, it is more memory-efficient compared to the standard K-Means approach.

4. **Faster Convergence**:

   - The use of mini-batches leads to quicker updates to the centroids, often resulting in faster convergence times, especially for large datasets.

5. **Flexibility**:
   - MiniBatchKMeans allows for the adjustment of the mini-batch size, which can be tuned based on the specific requirements of the dataset and application.

### Advantages of MiniBatchKMeans

- **Scalability**: It is particularly suited for applications where the dataset is too large to fit into memory, allowing clustering on large-scale data.
- **Speed**: The algorithm runs faster than traditional K-Means, especially with large datasets, making it suitable for real-time applications.
- **Reduced Variance**: By using random mini-batches, the algorithm can help reduce the variance in the updates, which can lead to a more stable convergence behavior.


In [17]:
import time

import joblib
from sklearn.cluster import MiniBatchKMeans

In [18]:
def get_kmeans_model(args: Namespace) -> MiniBatchKMeans:
    """Create and return a MiniBatch KMeans clustering model.

    This function initializes a MiniBatch KMeans model with the specified
    parameters from an args.Namespace object, which is suitable for large
    datasets and can handle mini-batch updates for efficiency.

    Args:
        args (Namespace): A Namespace object containing the following parameters:
            - n_clusters (int): The number of clusters to form.
            - init (str): Method for initialization.
            - max_iter (int): Maximum number of iterations for a single run.
            - batch_size (int): Size of the mini-batches.
            - tol (float): Relative tolerance with regards to the inertia to declare convergence.
            - max_no_improvement (int): Maximum number of consecutive iterations with no improvement in the inertia.
            - n_init (int): Number of time the k-means algorithm will be run with different centroid seeds.
            - reassignment_ratio (float): The proportion of the previous cluster centers that must change to consider a clustering solution stable.
            - random_state (int): Random seed for initialization.

    Returns:
        MiniBatchKMeans: An initialized MiniBatchKMeans model ready for fitting to data.

    Raises:
        ValueError: If any of the input parameters are invalid or inconsistent.
    """

    return MiniBatchKMeans(
        n_clusters=args.n_clusters,
        init=args.init,
        max_iter=args.max_iter,
        batch_size=args.batch_size,
        tol=args.tol,
        max_no_improvement=args.max_no_improvement,
        n_init=args.n_init,
        reassignment_ratio=args.reassignment_ratio,
        random_state=args.random_state,
        verbose=0,
        compute_labels=True,
        init_size=None,
    )

In [19]:
args = Namespace(
    n_clusters=200,
    init="k-means++",
    max_iter=150,
    batch_size=1000,
    tol=0.0001,
    max_no_improvement=100,
    n_init=20,
    reassignment_ratio=0.5,
    random_state=42,
)

In [20]:
dataloader = load_librispeech_dataset(
    dataset_root_path, split="train", percentage=0.5, num_workers=4, collate_fn=prepare_dataset(feature_extractor),batch_size=8
)

In [21]:
from loguru import logger
import warnings

# 忽略特定的警告
warnings.filterwarnings("ignore", message="Support for mismatched key_padding_mask and attn_mask is deprecated. Use same type for both instead.")

In [None]:
logger.add(
    f"kmeans.log", rotation="5 MB", retention="7 days", level="DEBUG"
)  # Save to a file with rotation


# for n_clusters in [50, 100, 150]:
for n_clusters in [150]:
    # for n_clusters in [50]:
    args.n_clusters = n_clusters
    kmeans_model = get_kmeans_model(args)
    n_epochs = 10
    best_score = 100000000000000000
    tol = 0

    with torch.no_grad():
        for e in range(n_epochs):
            for i, batch in enumerate(tqdm(dataloader, desc="Processing Audio")):
                waveform = batch['waveform']
                
                outputs = model(waveform.cuda(1), attention_mask=batch['attention_mask'].cuda(1))
                last_hidden_states = outputs.last_hidden_state.cpu().numpy() # [batch, frames, features]
                middle_res = []
                for j, x in enumerate(last_hidden_states):
                    length = batch['attention_mask'][j].sum() // 320
                    middle_res.append(last_hidden_states[j, :length])
                middle_res = np.concatenate(middle_res)
                kmeans_model.partial_fit(middle_res)
                score = -kmeans_model.score(x) / len(x)
                if i % 500 == 0:
                    logger.info(
                        f"{n_clusters}, Epoch {e}, current score: {kmeans_model.inertia_}"
                    )

            val_score = validate_kmeans(kmeans_model, val_feat, metric="inertia")
            logger.info(f"{n_clusters}, Epoch {e}, val score: {val_score}")

            if val_score < best_score:
                best_score = val_score
                tol = 0
                joblib.dump(
                    kmeans_model,
                    os.path.join(dataset_root_path, f"kmeans_model-{n_clusters}.pkl"),
                )
            else:
                tol += 1
                if tol >= 2:
                    break

Processing Audio:   0%|          | 0/17578 [00:00<?, ?it/s]

[32m2024-12-02 12:54:57.938[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1m150, Epoch 0, current score: 382235.6875[0m
Processing Audio:   3%|▎         | 500/17578 [01:07<35:41,  7.98it/s] [32m2024-12-02 12:56:02.085[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1m150, Epoch 0, current score: 488562.78125[0m
Processing Audio:   6%|▌         | 1000/17578 [02:10<36:24,  7.59it/s][32m2024-12-02 12:57:04.939[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1m150, Epoch 0, current score: 411695.96875[0m
Processing Audio:   9%|▊         | 1500/17578 [03:18<34:46,  7.71it/s]  [32m2024-12-02 12:58:13.158[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1m150, Epoch 0, current score: 476722.09375[0m
Processing Audio:  11%|█▏        | 2000/17578 [04:21<32:13,  8.06it/s][32m2024-12-02 12:59:15.618[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m -

# Evaluate Kmeans model


## Alignment of Kmeans and phonemes


In [27]:
from transformers import AutoProcessor, AutoModelForCTC
from transformers import WavLMModel, Wav2Vec2FeatureExtractor, AutoFeatureExtractor

processor = AutoProcessor.from_pretrained("speech31/XLS-R-english-phoneme")
model = AutoModelForCTC.from_pretrained("speech31/XLS-R-english-phoneme")

Some weights of the model checkpoint at speech31/XLS-R-english-phoneme were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at speech31/XLS-R-english-phoneme and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this 

In [28]:
wavlm_feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base")

In [None]:
def pad_audio_with_max_length(list_audio):
    
    # Get the maximum length of the waveforms in the batch
    max_length = max(waveform.shape[-1] for waveform in list_audio)

    # Pad the waveforms and create a tensor for the batch
    padded_waveforms = []

    for i, waveform in enumerate(list_audio):
        # Pad the waveform with zeros
        print(len(list_audio), i, max_length, waveform.shape)
        padded_waveform = torch.nn.functional.pad(
            torch.from_numpy(waveform), (0, max_length - waveform.shape[1])
        )

    # Stack the waveforms into a single tensor
    return torch.stack(padded_waveforms)

In [54]:
def prepare_dataset(feature_extractor):

    def return_collate_fn(batch):
        res = {}
        model_input_name = feature_extractor.model_input_names[0]


        waveform = [item[0][0].numpy() for item in batch]
        inputs = feature_extractor(
            waveform,
            sampling_rate=16000,
            return_attention_mask=True,
            padding=True,
            return_tensors="pt",
        )
        print(inputs.get(model_input_name).shape)
        
        res['org_waveform'] = torch.nn.utils.rnn.pad_sequence([item[0][0] for item in batch], batch_first=True)

        res[model_input_name] = inputs.get(model_input_name)
        res["attention_mask"] = inputs.get("attention_mask")
        return res

    return return_collate_fn

In [55]:
dataloader = load_librispeech_dataset(
    dataset_root_path,
    split="train",
    percentage=0.5,
    num_workers=4,
    collate_fn=prepare_dataset(processor.feature_extractor),
)
ds = dataloader.dataset

In [56]:
for x in dataloader:
    with torch.no_grad():
        batch_res = model(x["input_values"], attention_mask=x["attention_mask"], output_hidden_states=True)
        logits = batch_res.logits
        phoneme_ids = torch.argmax(logits, dim=-1)
    break

torch.Size([32, 260880])
torch.Size([32, 270321])
torch.Size([32, 264080])
torch.Size([32, 267600])
torch.Size([32, 259840])
torch.Size([32, 271040])
torch.Size([32, 262720])
torch.Size([32, 266560])
torch.Size([32, 263040])


In [62]:
for key, value in x.items():
    print(f"Key: {key}, Value: {value.shape}, Value Type: {type(value)}")

Key: org_waveform, Value: torch.Size([32, 260880]), Value Type: <class 'torch.Tensor'>
Key: input_values, Value: torch.Size([32, 260880]), Value Type: <class 'torch.Tensor'>
Key: attention_mask, Value: torch.Size([32, 260880]), Value Type: <class 'torch.Tensor'>


In [22]:
from kmeans_model import KMeansTokenizer, CustomSpeechEncoder

In [23]:
encoder = CustomSpeechEncoder(vocab_size=150)

Some weights of the model checkpoint at microsoft/wavlm-base were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-base and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

In [None]:
a = encoder.encode_speech(x['org_waveform'], attention_mask=x['attention_mask'])['original_units']
b = encoder.encode_speech(x['org_waveform'])['original_units']
a - b



In [71]:
x['attention_mask'][0]

tensor([1, 1, 1,  ..., 0, 0, 0], dtype=torch.int32)