In [33]:
# from transformers import Wav2Vec2Model, Wav2Vec2Config
from transformers import AutoModel, AutoConfig

In [34]:
model_name_or_path = "utter-project/mHuBERT-147"
# model_name_or_path = "facebook/wav2vec2-large-xlsr-53"

layer = 7
output_hidden_states = True

In [35]:
config = AutoConfig.from_pretrained(model_name_or_path)

print("No. encoder layers", config.num_hidden_layers)

No. encoder layers 12


In [36]:
# Either enable hidden_states=True or prune layers

# Custom config
# config.num_hidden_layers = 7
if output_hidden_states:
   config.output_hidden_states=True

In [37]:
model = AutoModel.from_pretrained(model_name_or_path, config=config)

In [38]:
model.encoder.layers

ModuleList(
  (0-11): 12 x HubertEncoderLayer(
    (attention): HubertSdpaAttention(
      (k_proj): Linear(in_features=768, out_features=768, bias=True)
      (v_proj): Linear(in_features=768, out_features=768, bias=True)
      (q_proj): Linear(in_features=768, out_features=768, bias=True)
      (out_proj): Linear(in_features=768, out_features=768, bias=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)
    (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (feed_forward): HubertFeedForward(
      (intermediate_dropout): Dropout(p=0.1, inplace=False)
      (intermediate_dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
      (output_dense): Linear(in_features=3072, out_features=768, bias=True)
      (output_dropout): Dropout(p=0.1, inplace=False)
    )
    (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
)

### Extract Features (dummy)

In [39]:
import torch

# 2 seconds
x_t = torch.randn(1, 32000) # [B x Seq_Len]

out = model(x_t)

print(out["last_hidden_state"].shape)

torch.Size([1, 99, 768])


### Extract Features

In [40]:
audio_fn = "/home/christiaan/Dropbox/code/queries/babaloon_id_11_main_post_test_67591-68366.wav"

In [41]:
import librosa

x, sr = librosa.load(audio_fn, sr=16000)
x.shape

(12400,)

In [42]:
x_t = torch.tensor(x, dtype=torch.float32).view(1,-1)
x_t.shape

out = model(x_t)
print(out["last_hidden_state"].shape)

torch.Size([1, 38, 768])


### Extract queries

In [43]:
from tqdm import tqdm
import torchaudio
import torch.functional as F
from pathlib import Path
import numpy as np

# audio_dir = "/home/christiaan/Dropbox/code/queries"
audio_dir = "/home/christiaan/Dropbox/code/adult_templates"


# out_dir = f"out/{model_name_or_path.split('/')[-1]}-layer{layer}/npy"
out_dir = f"temp_adult"

Path(out_dir).mkdir(exist_ok=True, parents=True)

print("Extracting features:", audio_dir)
for wav_fn in tqdm(Path(audio_dir).rglob("*.wav")):

    wav, sr = torchaudio.load(wav_fn)
    if sr != 16000:
       wav = torchaudio.functional.resample(wav, sr, 16000)
    # wav = wav.unsqueeze(0).cuda()
    # print(wav.shape)

    with torch.inference_mode():
    #     wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
        out = model(wav)
    # print(out["hidden_states"][7].shape)
    out_fn = f"{out_dir}/{wav_fn.stem}.npy"
    np.save(out_fn, out["hidden_states"][layer].squeeze().cpu().numpy())
    # break

    # out_fn = (feat_dir/wav_fn.stem).with_suffix(".npy")
    # np.save(out_fn, x.squeeze().cpu().numpy())

Extracting features: /home/christiaan/Dropbox/code/adult_templates


20it [00:02,  8.60it/s]


### Normalize

In [46]:
# def speaker_mvn(feat_dict):
#     """
#     Perform per-speaker mean and variance normalisation.

#     It is assumed that each of the keys in `feat_dict` starts with a speaker
#     identifier followed by an underscore.
#     """

# npy_dir = "out/wav2vec2-large-xlsr-53-layer12/npy/"
npy_dir = "temp_adult"

# out_dir = f"{Path(npy_dir).parent}-mvn/npy"

out_dir = f"{Path(npy_dir)}-mvn/npy"

Path(out_dir).mkdir(exist_ok=True, parents=True)

npy_fn_list = [x for x in Path(npy_dir).rglob("*.npy")]
print(npy_fn_list)

speaker_dict = {}
for npy_fn in tqdm(npy_fn_list):
    spk = npy_fn.stem.split("_")[2]
    if spk not in speaker_dict:
        speaker_dict[spk] = []
    speaker_dict[spk].append(npy_fn)
# print(speaker_dict)

for spk in speaker_dict:
    print("Speaker", spk)
    speaker_features = {}
    for npy_fn in speaker_dict[spk]:
        speaker_features[npy_fn.stem]= np.load(npy_fn)
    # print(len(speaker_features))
    # print(speaker_features)

    features = np.vstack([speaker_features[x] for x in speaker_features])
    speaker_mean = np.mean(features, axis=0)
    speaker_std = np.std(features, axis=0)

    out_dict = {}
    for utt_key in speaker_features:
        out_dict[utt_key] = (
            (speaker_features[utt_key] - speaker_mean) / 
            speaker_std
        )
    # print(out_dict)
    for utt_key in out_dict:
        np.save(f"{out_dir}/{utt_key}.npy", out_dict[utt_key])
    
    # break

[PosixPath('temp_adult/honger_id_0.npy'), PosixPath('temp_adult/muis_id_0.npy'), PosixPath('temp_adult/hardloop_id_0.npy'), PosixPath('temp_adult/kat_id_0.npy'), PosixPath('temp_adult/katjie_id_0.npy'), PosixPath('temp_adult/vissies_id_0.npy'), PosixPath('temp_adult/seuntjie_id_0.npy'), PosixPath('temp_adult/lekker_id_0.npy'), PosixPath('temp_adult/kwaad_id_0.npy'), PosixPath('temp_adult/butterfly_id_0.npy'), PosixPath('temp_adult/babaloon_id_0.npy'), PosixPath('temp_adult/water_id_0.npy'), PosixPath('temp_adult/sien_id_0.npy'), PosixPath('temp_adult/voel_id_0.npy'), PosixPath('temp_adult/hond_id_0.npy'), PosixPath('temp_adult/gelukkig_id_0.npy'), PosixPath('temp_adult/worsies_id_0.npy'), PosixPath('temp_adult/seer_id_0.npy'), PosixPath('temp_adult/boom_id_0.npy'), PosixPath('temp_adult/balloon_id_0.npy')]


100%|██████████| 20/20 [00:00<00:00, 196454.52it/s]

Speaker 0



