In [1]:
# Requires: librosa
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import librosa
import torch
import numpy as np

# Hugging Face Hub checkpoint used for speech-emotion classification.
model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"

# Feature extractor paired with the checkpoint; it converts raw waveforms
# into the inputs the model consumes.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)

# Audio-classification model and its class-index -> label mapping,
# which is used later to turn a predicted index into an emotion name.
model = AutoModelForAudioClassification.from_pretrained(model_id)
id2label = model.config.id2label


2025-09-13 20:47:38.440184: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-13 20:47:38.499683: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def preprocess_audio(audio_path, feature_extractor, max_duration=30.0):
    """Load an audio file and turn it into fixed-length model inputs.

    The waveform is loaded with librosa at the feature extractor's sampling
    rate, then cut or right-padded (``np.pad`` defaults, i.e. zeros) so that
    it holds exactly ``int(sampling_rate * max_duration)`` samples before it
    is handed to the feature extractor.

    Args:
        audio_path: Path of the audio file to load.
        feature_extractor: Callable exposing ``sampling_rate``; invoked with
            the waveform plus ``sampling_rate``, ``max_length``,
            ``truncation=True`` and ``return_tensors="pt"``.
        max_duration: Target clip length in seconds.

    Returns:
        Whatever ``feature_extractor`` returns for the fixed-length waveform.
    """
    target_sr = feature_extractor.sampling_rate
    waveform, _ = librosa.load(audio_path, sr=target_sr)

    max_length = int(target_sr * max_duration)
    missing = max_length - len(waveform)
    if missing < 0:
        # Clip is longer than the target: keep only the leading samples.
        waveform = waveform[:max_length]
    else:
        # Clip is at most the target length: pad the tail (a no-op when equal).
        waveform = np.pad(waveform, (0, missing))

    return feature_extractor(
        waveform,
        sampling_rate=target_sr,
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )


In [3]:
def predict_emotion(audio_path, model, feature_extractor, id2label, max_duration=30.0):
    """Classify the emotion of a single audio file.

    Args:
        audio_path: Path of the audio file to classify.
        model: Model whose call result exposes ``.logits``; it is moved to
            CUDA when available, otherwise to CPU.
        feature_extractor: Passed through to ``preprocess_audio``.
        id2label: Mapping from predicted class index to label.
        max_duration: Clip length in seconds, passed to ``preprocess_audio``.

    Returns:
        The ``id2label`` entry for the highest-scoring class.
    """
    features = preprocess_audio(audio_path, feature_extractor, max_duration)

    target = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(target)
    model = model.to(device)

    # Every input tensor must live on the same device as the model.
    batch = {}
    for name, tensor in features.items():
        batch[name] = tensor.to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**batch).logits

    best_index = torch.argmax(logits, dim=-1).item()
    return id2label[best_index]


***wav2vec2***


In [4]:
from pydub import AudioSegment
import math
import os

# Source recording to analyse.
audio_path = "downloads/comdey/vocals.wav"

# Directory that receives the exported per-segment WAV files.
# NOTE: this is an absolute, machine-specific location; it is kept unchanged
# so the segments land exactly where they did before.
segments_dir = "/Data/deepakkumar/Subham_work/downloads/comdey/"
# Create it up front so the first export cannot fail on a missing directory.
os.makedirs(segments_dir, exist_ok=True)

# Load full audio
audio = AudioSegment.from_file(audio_path)

# Segment length in ms
segment_length = 5 * 1000

# Number of segments (rounded up, so the last one may be shorter)
num_segments = math.ceil(len(audio) / segment_length)

for i in range(num_segments):
    start_time = i * segment_length
    # Clamp the final segment to the end of the recording.
    end_time = min(start_time + segment_length, len(audio))

    segment = audio[start_time:end_time]
    segment_path = os.path.join(segments_dir, f"segment_{i}.wav")

    # Save segment so it can be re-loaded from disk by predict_emotion
    segment.export(segment_path, format="wav")

    # Predict emotion for this segment
    predicted_emotion = predict_emotion(segment_path, model, feature_extractor, id2label)
    print(f"Segment {i+1} ({start_time/1000:.1f}-{end_time/1000:.1f}s): {predicted_emotion}")


Segment 1 (0.0-5.0s): angry
Segment 2 (5.0-10.0s): surprised
Segment 3 (10.0-15.0s): happy
Segment 4 (15.0-20.0s): happy
Segment 5 (20.0-25.0s): happy
Segment 6 (25.0-26.8s): sad


In [None]:
# What's next?

In [None]:
# import numpy as np
# import torch
# import torch.nn as nn
# from transformers import Wav2Vec2Processor
# from transformers.models.wav2vec2.modeling_wav2vec2 import (
#     Wav2Vec2Model,
#     Wav2Vec2PreTrainedModel,
# )


# class RegressionHead(nn.Module):
#     r"""Classification head."""

#     def __init__(self, config):

#         super().__init__()

#         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
#         self.dropout = nn.Dropout(config.final_dropout)
#         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

#     def forward(self, features, **kwargs):

#         x = features
#         x = self.dropout(x)
#         x = self.dense(x)
#         x = torch.tanh(x)
#         x = self.dropout(x)
#         x = self.out_proj(x)

#         return x


# class EmotionModel(Wav2Vec2PreTrainedModel):
#     r"""Speech emotion classifier."""

#     def __init__(self, config):

#         super().__init__(config)

#         self.config = config
#         self.wav2vec2 = Wav2Vec2Model(config)
#         self.classifier = RegressionHead(config)
#         self.init_weights()

#     def forward(
#             self,
#             input_values,
#     ):

#         outputs = self.wav2vec2(input_values)
#         hidden_states = outputs[0]
#         hidden_states = torch.mean(hidden_states, dim=1)
#         logits = self.classifier(hidden_states)

#         return hidden_states, logits



# # load model from hub
# device = 'cpu'
# model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
# processor = Wav2Vec2Processor.from_pretrained(model_name)
# model = EmotionModel.from_pretrained(model_name).to(device)



# def process_func(
#     x: np.ndarray,
#     sampling_rate: int,
#     embeddings: bool = False,
# ) -> np.ndarray:
#     r"""Predict emotions or extract embeddings from raw audio signal."""

#     # run through processor to normalize signal
#     # always returns a batch, so we just get the first entry
#     # then we put it on the device
#     y = processor(x, sampling_rate=sampling_rate)
#     y = y['input_values'][0]
#     y = y.reshape(1, -1)
#     y = torch.from_numpy(y).to(device)

#     # run through model
#     with torch.no_grad():
#         y = model(y)[0 if embeddings else 1]

#     # convert to numpy
#     y = y.detach().cpu().numpy()

#     return y

# # segment_path = f"/Data/deepakkumar/Subham_work/downloads/emotions/segment_{i}.wav"
    
# import os
# import librosa

# # path where segments are stored
# segments_dir = "/Data/deepakkumar/Subham_work/downloads/comdey/"
# sampling_rate = 16000

# # loop over all wav files in the folder
# for file in sorted(os.listdir(segments_dir)):
#     if file.endswith(".wav"):
#         segment_path = os.path.join(segments_dir, file)

#         # load waveform
#         x, sr = librosa.load(segment_path, sr=sampling_rate)

#         # predict arousal, dominance, valence (the model's three outputs, in that order)
#         vad_values = process_func(x, sr)

#         print(f"{file}: {vad_values}")



segment_0.wav: [[0.93654907 0.87850475 0.55000186]]
segment_1.wav: [[0.74494344 0.7079811  0.6341447 ]]
segment_2.wav: [[0.88779783 0.8471817  0.87374544]]
segment_3.wav: [[0.8571902  0.79949933 0.8555167 ]]
segment_4.wav: [[0.6314155 0.6750469 0.6169157]]
segment_5.wav: [[0.69133055 0.62912136 0.6929021 ]]


In [None]:
# import os
# import librosa
# import numpy as np
# import audeer
# import audonnx

# # model path
# url = 'https://zenodo.org/record/6221127/files/w2v2-L-robust-12.6bc4a7fd-1.1.0.zip'
# cache_root = audeer.mkdir('cache')
# model_root = audeer.mkdir('model')

# archive_path = audeer.download_url(url, cache_root, verbose=True)
# audeer.extract_archive(archive_path, model_root)
# model = audonnx.load(model_root)

# # directory of audio segments
# segments_dir = "/Data/deepakkumar/Subham_work/downloads/comdey/"
# sampling_rate = 16000

# # loop over all wav files
# for file in sorted(os.listdir(segments_dir)):
#     if file.endswith(".wav"):
#         segment_path = os.path.join(segments_dir, file)

#         # load waveform at 16kHz
#         x, sr = librosa.load(segment_path, sr=sampling_rate)

#         # make sure it's float32
#         x = x.astype(np.float32)

#         # run model
#         vad = model(x, sampling_rate)

#         print(f"{file}: {vad}")


segment_0.wav: {'hidden_states': array([[-0.00731635,  0.00544077, -0.00931627, ...,  0.00779994,
         0.00887228,  0.00689255]], dtype=float32), 'logits': array([[0.9365496 , 0.87850547, 0.54999673]], dtype=float32)}
segment_1.wav: {'hidden_states': array([[-0.00744769,  0.00546398, -0.00754645, ...,  0.00744426,
         0.0088043 ,  0.00626639]], dtype=float32), 'logits': array([[0.74494237, 0.7079809 , 0.6341444 ]], dtype=float32)}
segment_2.wav: {'hidden_states': array([[-0.00752568,  0.00799264, -0.01331591, ...,  0.00720073,
         0.00877592,  0.0025263 ]], dtype=float32), 'logits': array([[0.8877969, 0.8471813, 0.8737451]], dtype=float32)}
segment_3.wav: {'hidden_states': array([[-0.00757571,  0.00625499, -0.01207605, ...,  0.00766464,
         0.00924588,  0.00633155]], dtype=float32), 'logits': array([[0.8571905, 0.7995001, 0.8555168]], dtype=float32)}
segment_4.wav: {'hidden_states': array([[-0.00727458,  0.00568754, -0.01013857, ...,  0.00726279,
         0.00864966,