<a href="https://colab.research.google.com/github/RANGAM-AKHILA/TESS/blob/main/audio_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers torchaudio tqdm


In [None]:
import kagglehub
# Download (or reuse the cached copy of) the TESS dataset; `path` holds the
# local directory that kagglehub reports for it.
TESS_DATASET_HANDLE = "ejlok1/toronto-emotional-speech-set-tess"
path = kagglehub.dataset_download(TESS_DATASET_HANDLE)

Using Colab cache for faster access to the 'toronto-emotional-speech-set-tess' dataset.


In [None]:
# Show where kagglehub placed the dataset so the dataset root used later in
# the notebook can be checked against it.
print(path)

/kaggle/input/toronto-emotional-speech-set-tess


In [None]:
import os
import torch
import torchaudio
from transformers import HubertModel, Wav2Vec2FeatureExtractor
from tqdm import tqdm

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# Derive the dataset root from the directory kagglehub returned (`path`, set in
# the download cell) instead of hard-coding the Colab cache location, so the
# notebook keeps working when the download lands somewhere else.
DATASET_PATH = os.path.join(path, "TESS Toronto emotional speech set data")

# Destination for the extracted feature tensors (one sub-folder per emotion).
# NOTE(review): no drive.mount(...) call is visible in this notebook; if Google
# Drive is not mounted, os.makedirs below will silently create this directory on
# the session's local disk instead — confirm Drive is mounted before running.
FEATURE_SAVE_PATH = "/content/drive/MyDrive/Colab Notebooks/hubert_features"

# Sampling rate (Hz) that every clip is resampled to and that is passed to the
# feature extractor.
SAMPLE_RATE = 16000
os.makedirs(FEATURE_SAVE_PATH, exist_ok=True)


Device: cuda


In [None]:
from transformers import Wav2Vec2FeatureExtractor, HubertModel
import torch

# Same device selection as the configuration cell, repeated so this cell can be
# re-run on its own.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the checkpoint so the feature extractor and the
# model can never be loaded from different checkpoints by accident.
HUBERT_CHECKPOINT = "facebook/hubert-base-ls960"

# Feature extractor: turns a raw waveform into the model's `input_values`.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(HUBERT_CHECKPOINT)

# Load HuBERT with hidden states enabled: the extraction loop reads
# `outputs.hidden_states`, which is only populated when this flag is set.
# `.eval()` is chained into the assignment (it returns the module itself) so
# the model is put in inference mode without the notebook echoing the full
# module repr as the cell output.
hubert = HubertModel.from_pretrained(
    HUBERT_CHECKPOINT,
    output_hidden_states=True,   # ⭐ IMPORTANT
).to(DEVICE).eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/211 [00:00<?, ?it/s]

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): HubertEncoder(
    (pos_conv_embed): HubertPositionalConvEmbedding(
      (conv): Para

In [None]:
# Freeze HuBERT: mark every weight as not requiring gradients, since the model
# is only used here to produce features.
for weight in hubert.parameters():
    weight.requires_grad = False

In [None]:
def normalize_emotion(e):
    """Return a canonical, lower-case emotion label.

    Surrounding whitespace is stripped and the label is lower-cased. The two
    spellings "pleasant_surprised" and "pleasant_surprise" are both mapped to
    "pleasant_surprise"; every other label is returned as cleaned.

    Args:
        e: Raw emotion label, or None.

    Returns:
        The normalized label, or None when `e` is None.
    """
    if e is None:
        return None

    label = e.strip().lower()
    surprise_spellings = {"pleasant_surprised", "pleasant_surprise"}
    return "pleasant_surprise" if label in surprise_spellings else label


In [None]:
# Slice of `outputs.hidden_states` that is averaged into one feature tensor
# (layers 5–9).
FUSED_LAYERS = slice(5, 10)

# Walk every emotion folder, run each clip through HuBERT and save one
# (T, 768) tensor per clip under FEATURE_SAVE_PATH/<emotion>/<clip>.pt.
# Clips whose output file already exists are skipped, so the cell can be
# re-run to resume an interrupted extraction.
with torch.no_grad():

    for folder in tqdm(os.listdir(DATASET_PATH), desc="Emotions"):

        folder_path = os.path.join(DATASET_PATH, folder)

        if not os.path.isdir(folder_path):
            continue

        # The emotion label is everything after the first underscore of the
        # folder name. Folders that do not follow that pattern are reported and
        # skipped instead of aborting the whole run with an IndexError.
        _, separator, raw_label = folder.partition("_")
        if not separator or not raw_label.strip():
            tqdm.write(f"Skipping folder without an emotion suffix: {folder}")
            continue

        emotion = normalize_emotion(raw_label)

        save_dir = os.path.join(FEATURE_SAVE_PATH, emotion)
        os.makedirs(save_dir, exist_ok=True)

        # Case-insensitive match so ".WAV" files are not silently ignored.
        wav_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".wav")]

        for wav_file in tqdm(wav_files, desc=f"Extracting {emotion}", leave=False):

            wav_path = os.path.join(folder_path, wav_file)
            # Swap only the extension; str.replace would also rewrite a ".wav"
            # that happens to appear in the middle of the file name.
            save_path = os.path.join(
                save_dir,
                os.path.splitext(wav_file)[0] + ".pt"
            )

            if os.path.exists(save_path):
                continue

            # -------------------------------------------------
            # LOAD AUDIO
            # -------------------------------------------------
            waveform, sr = torchaudio.load(wav_path)
            # torchaudio returns (channels, frames). Averaging over the channel
            # axis yields a 1-D mono signal: identical to squeeze(0) for mono
            # files, and a proper downmix for multi-channel files (which would
            # otherwise be interpreted as a batch of separate clips).
            waveform = waveform.mean(dim=0)

            if sr != SAMPLE_RATE:
                waveform = torchaudio.functional.resample(
                    waveform, sr, SAMPLE_RATE
                )

            # -------------------------------------------------
            # FEATURE EXTRACTOR
            # -------------------------------------------------
            inputs = feature_extractor(
                waveform.numpy(),
                sampling_rate=SAMPLE_RATE,
                return_tensors="pt"
            )

            input_values = inputs.input_values.to(DEVICE)

            # -------------------------------------------------
            # HUBERT FORWARD PASS
            # -------------------------------------------------
            outputs = hubert(input_values)

            # Stack the selected layers → (5, B, T, 768), then mean-fuse over
            # the layer axis → (B, T, 768).
            fused = torch.stack(outputs.hidden_states[FUSED_LAYERS]).mean(dim=0)

            # Remove batch dimension → (T, 768) and move to the CPU for saving.
            features = fused.squeeze(0).cpu()

            # -------------------------------------------------
            # SAVE FEATURES
            # -------------------------------------------------
            # Write to a temporary name and rename afterwards: if the run is
            # interrupted mid-write, no truncated .pt file is left behind for
            # the "already exists" check above to skip on the next run.
            tmp_path = save_path + ".tmp"
            torch.save(features, tmp_path)
            os.replace(tmp_path, save_path)

print("✅ Feature extraction completed successfully.")

Emotions:   0%|          | 0/14 [00:00<?, ?it/s]
Extracting fear:   0%|          | 0/200 [00:00<?, ?it/s][A
                                                        [A
Extracting angry:   0%|          | 0/200 [00:00<?, ?it/s][A
                                                         [A
Extracting fear:   0%|          | 0/200 [00:00<?, ?it/s][A
Emotions:  21%|██▏       | 3/14 [00:00<00:00, 23.67it/s]
Extracting disgust:   0%|          | 0/200 [00:00<?, ?it/s][A
                                                           [A
Extracting neutral:   0%|          | 0/200 [00:00<?, ?it/s][A
                                                           [A
Extracting angry:   0%|          | 0/200 [00:00<?, ?it/s][A
Emotions:  43%|████▎     | 6/14 [00:00<00:00, 23.41it/s]
Extracting sad:   0%|          | 0/200 [00:00<?, ?it/s][A
                                                       [A
Extracting disgust:   0%|          | 0/200 [00:00<?, ?it/s][A
                                         

✅ Feature extraction completed successfully.





In [None]:
# Sanity check: load one saved feature file back and inspect its shape.
# The path is built from FEATURE_SAVE_PATH so it follows the configured output
# directory instead of duplicating it as a literal.
sample_path = os.path.join(FEATURE_SAVE_PATH, "angry", "OAF_back_angry.pt")
sample = torch.load(sample_path)
print(sample.shape)

# Each saved tensor should be (frames, 768): one 768-dimensional vector per frame.
assert sample.ndim == 2 and sample.shape[1] == 768, (
    f"unexpected feature shape: {tuple(sample.shape)}"
)


torch.Size([76, 768])
