In [67]:
import torch
import torchinfo
from transformers import HubertModel, AutoProcessor

# One checkpoint id is shared by the processor and the encoder.
model_id = "facebook/hubert-large-ls960-ft"
processor = AutoProcessor.from_pretrained(model_id)

# Load the weights from safetensors, keep the module on the CPU and put it
# in eval mode before summarising its layers.
model = HubertModel.from_pretrained(model_id, use_safetensors=True)
model = model.to("cpu")
model.eval()
torchinfo.summary(model)

Layer (type:depth-idx)                                  Param #
HubertModel                                             1,024
├─HubertFeatureEncoder: 1-1                             --
│    └─ModuleList: 2-1                                  --
│    │    └─HubertLayerNormConvLayer: 3-1               6,656
│    │    └─HubertLayerNormConvLayer: 3-2               787,968
│    │    └─HubertLayerNormConvLayer: 3-3               787,968
│    │    └─HubertLayerNormConvLayer: 3-4               787,968
│    │    └─HubertLayerNormConvLayer: 3-5               787,968
│    │    └─HubertLayerNormConvLayer: 3-6               525,824
│    │    └─HubertLayerNormConvLayer: 3-7               525,824
├─HubertFeatureProjection: 1-2                          --
│    └─LayerNorm: 2-2                                   1,024
│    └─Linear: 2-3                                      525,312
│    └─Dropout: 2-4                                     --
├─HubertEncoderStableLayerNorm: 1-3                     --
│    └─

torch.Size([2, 2367288])

In [None]:
import librosa

# Load both channels (mono=False) and let librosa resample to 16 kHz.
# The returned audio is a NumPy array (it is wrapped with torch.from_numpy
# in a later cell), not a torch.Tensor.
waveform, sr = librosa.load("temp.wav", sr=16000, mono=False)  # -> (np.ndarray, sample rate)
waveform

AttributeError: module 'soundfile' has no attribute 'SoundFileRuntimeError'

In [124]:
# Shape of the librosa result: (channels, samples at 16 kHz).
waveform.shape

(2, 858880)

In [125]:
# Keep the librosa-resampled audio as a tensor under its own name (`t1`);
# `waveform` is rebound by the torchaudio cells that follow.
t1 = torch.from_numpy(waveform)
t1.size()

torch.Size([2, 858880])

In [158]:
import torchaudio

# Re-load the same file with torchaudio. This rebinds `waveform` and `sr`;
# the reported rate is the file's own rate, so no resampling has happened yet.
waveform, sr = torchaudio.load("temp.wav")
waveform.size(), sr

(torch.Size([2, 2367288]), 44100)

In [159]:
sampling_rate = 16000  # target rate in Hz used for the rest of the notebook

# Resample only while `waveform` is still at a different rate, then keep `sr`
# in sync with the data. Without the guard and the `sr` update, re-running
# this cell would resample the already-resampled signal a second time using
# the stale source rate.
if sr != sampling_rate:
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=sr, new_freq=sampling_rate
    )
    sr = sampling_rate
waveform.size()

torch.Size([2, 858880])

In [160]:
def save_waveform(path, waveform, sr):
    """Write ``waveform`` to ``path`` as a 16-bit signed-PCM WAV file.

    Args:
        path: Destination file path.
        waveform: Audio tensor; it is moved to the CPU before saving. In this
            notebook it is passed as (channels, samples).
        sr: Sample rate in Hz recorded in the file.
    """
    wav_options = {"format": "wav", "encoding": "PCM_S", "bits_per_sample": 16}
    cpu_audio = waveform.cpu()
    torchaudio.save(path, cpu_audio, sample_rate=sr, **wav_options)


# Persist the resampled audio; use the shared `sampling_rate` constant instead
# of repeating the literal so the file header always matches the resample target.
save_waveform("temp_resembeled.wav", waveform, sampling_rate)

In [161]:
def count_different_values(t1, t2, atol=1e-3):
    """Count the elements at which two tensors differ by more than ``atol``.

    Args:
        t1: First tensor.
        t2: Second tensor; must be broadcastable against ``t1``.
        atol: Absolute tolerance. A position counts as different only when
            ``|t1 - t2|`` is strictly greater than this value. Defaults to
            ``1e-3``, the threshold that used to be hard-coded here.

    Returns:
        int: Number of positions whose absolute difference exceeds ``atol``.
    """
    exceeds_tolerance = torch.abs(t1 - t2) > atol
    return torch.sum(exceeds_tolerance).item()


# Count the samples where the librosa-resampled tensor (`t1`) and the
# torchaudio-resampled tensor (`waveform`) disagree by more than the
# function's tolerance.
# Spot check of the raw values: t1[1][:10], waveform[1][:10]

count_different_values(t1, waveform)

243159

In [162]:
# Run the checkpoint's processor over the resampled audio, passing the shared
# `sampling_rate` constant rather than a repeated literal.
# NOTE(review): the 2-D tensor comes back with a leading singleton axis
# (1, channels, samples), so the processor appears to treat the whole stereo
# tensor as a single example — confirm that processing both channels together
# (rather than one example per channel) is intended.
signal = processor(
    waveform, sampling_rate=sampling_rate, return_tensors="pt"
).input_values
signal.size()

torch.Size([1, 2, 858880])

In [163]:
from einops import reduce, rearrange

# Drop the leading singleton axis added by the processor so that `signal` is
# (channels, samples) again. The literal "1" in the pattern makes this fail
# if the leading axis is not of size one, so the cell cannot be run twice
# on the same `signal`.
signal = rearrange(signal, "1 channels timestemps -> channels timestemps")
signal.size()

torch.Size([2, 858880])

In [164]:
# Save the processor output so it can be compared by ear with the plain
# resampled file.
# NOTE(review): the processor rescales the samples (see the value comparison
# further down); 16-bit PCM may clip if values leave [-1, 1] — confirm.
save_waveform("temp_resempled_processed.wav", signal, sampling_rate)

In [92]:
# Check whether the processor left the samples unchanged (within 1e-5).
torch.allclose(waveform, signal, atol=1e-5)

False

In [93]:
# First five samples of channel 0 before and after the processor, to see how
# the values were rescaled.
waveform[0][:5], signal[0][:5]

(tensor([0.0005, 0.0010, 0.0009, 0.0008, 0.0008]),
 tensor([0.0185, 0.0347, 0.0321, 0.0301, 0.0289]))

In [88]:
# Forward pass for feature extraction only: eval mode, and no_grad so no
# autograd graph is recorded.
# NOTE(review): `signal` is (channels, samples), so the two channels are fed
# to the model as a batch of two — confirm this is intended.
model.eval()
with torch.no_grad():
    outputs = model(signal)

In [91]:
# Size of the final hidden states: (channels-as-batch, frames, hidden dim).
outputs.last_hidden_state.size()

torch.Size([2, 2683, 1024])

In [94]:
import einops

# Mean-pool the hidden states over the time axis, leaving one embedding
# vector per channel.
temp = einops.reduce(outputs.last_hidden_state, "channel timestemp dim -> channel dim", reduction="mean")
temp.size()

torch.Size([2, 1024])