In [2]:
!pip install torchcodec
!pip install torchaudio
!pip install transformers
!pip install kagglehub



In [3]:
import torch
import torchaudio
import torchcodec
import matplotlib.pyplot as plt
import librosa
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForSequenceClassification,
    TrainingArguments,
    Trainer,
    Wav2Vec2FeatureExtractor
)
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import kagglehub
import numpy as np
import pandas as pd
from pathlib import Path
import torchaudio.transforms as T




## count storage for different classes

In [4]:
emotion_counts = {
    "fearful": 0,
    "angry": 0,
    "happy": 0,
    "sad": 0,
    "neutral": 0
}

In [5]:
emotion_counts

{'fearful': 0, 'angry': 0, 'happy': 0, 'sad': 0, 'neutral': 0}

# Download and prepare dataset

## Download RAVDESS dataset

In [6]:
rav_path = kagglehub.dataset_download('uwrfkaggler/ravdess-emotional-speech-audio')


Using Colab cache for faster access to the 'ravdess-emotional-speech-audio' dataset.


In [7]:
os.listdir(rav_path)

['Actor_02',
 'Actor_17',
 'Actor_05',
 'Actor_16',
 'Actor_21',
 'Actor_01',
 'Actor_11',
 'Actor_20',
 'Actor_08',
 'Actor_15',
 'Actor_06',
 'Actor_12',
 'Actor_23',
 'Actor_24',
 'Actor_22',
 'Actor_04',
 'Actor_19',
 'Actor_10',
 'Actor_09',
 'audio_speech_actors_01-24',
 'Actor_14',
 'Actor_03',
 'Actor_13',
 'Actor_18',
 'Actor_07']

## Extracting data and lables from RAVDESS

In [8]:
# as defined in RAVDESS
emotion_map = {
    "01": "neutral",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful"
}

data = []

for actor in os.listdir(rav_path):
  actor_dir = os.path.join(rav_path, actor)
  for file in os.listdir(actor_dir):
    try:
      emotion_code = file.split('-')[2]
      if emotion_code in emotion_map:
        if emotion_counts[emotion_map[emotion_code]]<2700:
          data.append({
            "path": os.path.join(actor_dir, file),
            "emotion": emotion_map[emotion_code]
          })
          emotion_counts[emotion_map[emotion_code]]+=1
    except IndexError:
      pass

In [9]:
len(data)

864

In [10]:
emotion_counts

{'fearful': 192, 'angry': 192, 'happy': 192, 'sad': 192, 'neutral': 96}

In [11]:
data[0]

{'path': '/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-01-01-01-01-02.wav',
 'emotion': 'neutral'}

## Download CREMA D dataset

In [12]:
crema_path = kagglehub.dataset_download("ejlok1/cremad")

Using Colab cache for faster access to the 'cremad' dataset.


In [13]:
os.listdir(crema_path+"/AudioWAV")

['1028_TSI_DIS_XX.wav',
 '1075_IEO_HAP_LO.wav',
 '1084_ITS_HAP_XX.wav',
 '1067_IWW_DIS_XX.wav',
 '1066_TIE_DIS_XX.wav',
 '1027_DFA_DIS_XX.wav',
 '1032_IEO_HAP_HI.wav',
 '1023_TIE_SAD_XX.wav',
 '1041_TAI_DIS_XX.wav',
 '1004_WSI_SAD_XX.wav',
 '1030_DFA_HAP_XX.wav',
 '1037_WSI_NEU_XX.wav',
 '1006_IOM_FEA_XX.wav',
 '1053_IEO_HAP_LO.wav',
 '1076_IEO_FEA_HI.wav',
 '1055_TSI_SAD_XX.wav',
 '1019_TIE_DIS_XX.wav',
 '1014_TAI_ANG_XX.wav',
 '1087_TIE_DIS_XX.wav',
 '1009_IWW_DIS_XX.wav',
 '1066_MTI_ANG_XX.wav',
 '1026_IEO_FEA_LO.wav',
 '1035_TSI_DIS_XX.wav',
 '1059_DFA_ANG_XX.wav',
 '1054_ITH_DIS_XX.wav',
 '1081_TIE_NEU_XX.wav',
 '1039_ITS_NEU_XX.wav',
 '1063_TSI_HAP_XX.wav',
 '1056_IEO_ANG_HI.wav',
 '1031_ITS_ANG_XX.wav',
 '1062_MTI_ANG_XX.wav',
 '1085_WSI_DIS_XX.wav',
 '1020_DFA_SAD_XX.wav',
 '1042_IWW_ANG_XX.wav',
 '1024_TIE_HAP_XX.wav',
 '1017_TSI_NEU_XX.wav',
 '1005_IWW_ANG_XX.wav',
 '1013_TAI_ANG_XX.wav',
 '1089_IWL_HAP_XX.wav',
 '1074_MTI_ANG_XX.wav',
 '1043_IOM_SAD_XX.wav',
 '1014_TIE_HAP_X

## Extracting datalabels from CREMA D

In [14]:
crema_emotion_map = {
    "ANG": "angry",
    "FEA": "fearful",
    "HAP": "happy",
    "NEU": "neutral",
    "SAD": "sad"
}

for file in os.listdir(crema_path+"/AudioWAV"):
  emotion_code = file.split('_')[2]
  if emotion_code in crema_emotion_map and emotion_counts[crema_emotion_map[emotion_code]]<2700:
    audio_file_path = os.path.join(crema_path+"/AudioWAV", file)
    try:
      # Validate audio file
      s = torchaudio.load(audio_file_path)
      data.append({
        "path": audio_file_path,
        "emotion": crema_emotion_map[emotion_code]
      })
      emotion_counts[crema_emotion_map[emotion_code]]+=1
    except Exception as e:
      print(f"Error loading CREMA D file: {audio_file_path} - {e}")

In [15]:
len(data)

7035

In [16]:
emotion_counts

{'fearful': 1463, 'angry': 1463, 'happy': 1463, 'sad': 1463, 'neutral': 1183}

## AESDD dataset

In [17]:
!mkdir -p /content/data
!gdown https://drive.google.com/uc?id=1_IAWexEWpH-ly_JaA5EGfZDp-_3flkN1
!unzip -q aesdd.zip -d /content/data/
!mv "/content/data/Acted Emotional Speech Dynamic Database/" /content/data/aesdd/

Downloading...
From (original): https://drive.google.com/uc?id=1_IAWexEWpH-ly_JaA5EGfZDp-_3flkN1
From (redirected): https://drive.google.com/uc?id=1_IAWexEWpH-ly_JaA5EGfZDp-_3flkN1&confirm=t&uuid=333cb9ef-3e3a-45c2-a860-61d28b96c999
To: /content/aesdd.zip
100% 410M/410M [00:05<00:00, 71.6MB/s]
replace /content/data/Acted Emotional Speech Dynamic Database/fear/f18 (6).wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
mv: cannot move '/content/data/Acted Emotional Speech Dynamic Database/' to '/content/data/aesdd/Acted Emotional Speech Dynamic Database': Directory not empty


In [18]:
aesdd_emotion_map = {
    "angry": "angry",
    "fear": "fearful",
    "happiness": "happy",
    "sadness": "sad"
}

for path in tqdm(Path("/content/data/aesdd").glob("**/*.wav")):
    name = str(path).split('/')[-1].split('.')[0]
    label = str(path).split('/')[-2]

    try:
        # There are some broken files, that is why we load them first before adding them to the datset
        s = torchaudio.load(path)
        if emotion_counts[aesdd_emotion_map[label]]<2700:
          data.append({
              "path": path,
              "emotion": aesdd_emotion_map[label]
          })
          emotion_counts[aesdd_emotion_map[label]]+=1
    except Exception as e:
        # print(str(path), e)
        pass

1210it [00:12, 99.51it/s] 


In [19]:
emotion_counts

{'fearful': 1703, 'angry': 1463, 'happy': 1701, 'sad': 1707, 'neutral': 1183}

In [20]:
len(data)

7757

## Indian Emotion Speech Corpora (IESC) dataset

In [21]:
iesc_path = kagglehub.dataset_download("ybsingh/indian-emotional-speech-corpora-iesc")

Using Colab cache for faster access to the 'indian-emotional-speech-corpora-iesc' dataset.


In [22]:
os.listdir(iesc_path+"/Indian Emotional Speech Corpora (IESC)")

['Speaker-7',
 'Speaker-2',
 'Speaker-3',
 'Speaker-4',
 'Speaker-6',
 'Speaker-8',
 'Speaker-1',
 'Speaker-5']

In [23]:
emotion_counts

{'fearful': 1703, 'angry': 1463, 'happy': 1701, 'sad': 1707, 'neutral': 1183}

In [24]:
iesc_emotion_map = {
    "Anger": "angry",
    "Fear": "fearful",
    "Happy": "happy",
    "Sad": "sad",
    "Neutral": "neutral"
}

for speaker in os.listdir(iesc_path+"/Indian Emotional Speech Corpora (IESC)"):
  speaker_dir = os.path.join(iesc_path+"/Indian Emotional Speech Corpora (IESC)", speaker)
  for label_dir_name in os.listdir(speaker_dir):
    # Check if the directory name matches an emotion in our map
    if label_dir_name in iesc_emotion_map:
      emotion_label = iesc_emotion_map[label_dir_name]
      emotion_full_path = os.path.join(speaker_dir, label_dir_name)
      # Iterate through the actual .wav files inside the emotion directory
      for audio_file_name in os.listdir(emotion_full_path):
        if audio_file_name.endswith('.wav') and emotion_counts[emotion_label]<2700:
          audio_file_path = os.path.join(emotion_full_path, audio_file_name)
          try:
            # Validate audio file
            s = torchaudio.load(audio_file_path)
            data.append({
                "path": audio_file_path,
                "emotion": emotion_label
            })
            emotion_counts[emotion_label]+=1
          except Exception as e:
            print(f"Error loading IESC file: {audio_file_path} - {e}")

In [25]:
emotion_counts

{'fearful': 1823, 'angry': 1583, 'happy': 1821, 'sad': 1827, 'neutral': 1303}

## TESS Dataset

In [26]:
tess_path = kagglehub.dataset_download("ejlok1/toronto-emotional-speech-set-tess")

Using Colab cache for faster access to the 'toronto-emotional-speech-set-tess' dataset.


In [27]:
os.listdir(tess_path)

['TESS Toronto emotional speech set data',
 'tess toronto emotional speech set data']

In [28]:
tess_emotion_map = {
    "angry": "angry",
    "Fear": "fearful",
    "happy": "happy",
    "Sad": "sad"
}

for folder in os.listdir(tess_path+"/TESS Toronto emotional speech set data/"):
  label = folder.split('_')[1]
  for audio in os.listdir(os.path.join(tess_path+"/TESS Toronto emotional speech set data/", folder)):
    if label in tess_emotion_map and emotion_counts[tess_emotion_map[label]]<2700:
      audio_file_path = os.path.join(tess_path+"/TESS Toronto emotional speech set data/", folder, audio)
      try:
        # Validate audio file
        s = torchaudio.load(audio_file_path)
        data.append({
            "path": audio_file_path,
            "emotion": tess_emotion_map[label]
        })
        emotion_counts[tess_emotion_map[label]]+=1
      except Exception as e:
        print(f"Error loading TESS file: {audio_file_path} - {e}")

In [29]:
emotion_counts

{'fearful': 2023, 'angry': 1983, 'happy': 2221, 'sad': 2027, 'neutral': 1303}

In [30]:
len(data)

9557

## Augmenting audio to fix class imbalance

### function that perturbs the audio

In [31]:
import random

def perturb_speed(path):
    """
    Applies a random speed perturbation to an audio waveform.

    Args:
        path : The string that is the path of the audio file.

    Returns:
        torch.Tensor: The speed-perturbed waveform.
    """

    # Extract the audio from the path
    waveform, sample_rate = torchaudio.load(path)

    # Generate a random speed perturbation factor between 0.9 and 1.1
    speed_factor = random.uniform(0.9, 1.1)

    # Create an instance of torchaudio.transforms.Speed
    speed_transform = T.Speed(orig_freq=sample_rate, factor=speed_factor)

    # Apply the speed transformation
    perturbed_waveform = speed_transform(waveform)

    return perturbed_waveform

print("perturb_speed function defined.")

perturb_speed function defined.


### for the neutral emotion

In [2]:
import gc

# 1. Create a temporary directory named `temp_perturbed_audio`
temp_dir = "./temp_perturbed_audio"
os.makedirs(temp_dir, exist_ok=True)
print(f"Temporary directory created at: {temp_dir}")

# 2. Filter the existing `data` list to get only the samples labeled 'neutral'
neutral_samples = [item for item in data if item["emotion"] == "neutral"]
print(f"Initial number of neutral samples: {len(neutral_samples)}")

# 3. Set batch size and target
batch_size = 100
target = 2000
perturbation_counter = 0

# 4. Process in batches
while emotion_counts["neutral"] < target:
    batch_start = emotion_counts["neutral"]
    batch_target = min(batch_start + batch_size, target)

    print(f"\nProcessing batch: {batch_start} -> {batch_target}")

    # Process samples until we reach the batch target
    while emotion_counts["neutral"] < batch_target:
        # Ensure there are neutral samples to pick from
        if not neutral_samples:
            print("No neutral samples found to perturb. Exiting loop.")
            break

        selected_sample = random.choice(neutral_samples)
        original_path = selected_sample["path"]

        try:
            # Apply the `perturb_speed` function (it handles loading internally)
            perturbed_waveform = perturb_speed(original_path)

            # Load the original audio to get the sample rate for saving
            _, sample_rate = torchaudio.load(original_path)

            # Construct a unique temporary file path
            perturbed_file_name = f"perturbed_neutral_{perturbation_counter}.wav"
            perturbed_file_path = os.path.join(temp_dir, perturbed_file_name)

            # Save the perturbed waveform to this temporary file
            torchaudio.save(perturbed_file_path, perturbed_waveform, sample_rate)

            # IMPORTANT: Delete the waveform from memory immediately after saving
            del perturbed_waveform

            # Append a new dictionary entry to the `data` list
            data.append({
                "path": perturbed_file_path,
                "emotion": "neutral"
            })

            # Increment the count for 'neutral' in the `emotion_counts` dictionary
            emotion_counts["neutral"] += 1
            perturbation_counter += 1

        except Exception as e:
            print(f"Error processing {original_path}: {e}")

    # After each batch, clear memory
    gc.collect()
    print(f"Batch complete. Current neutral count: {emotion_counts['neutral']}/{target}")

# 5. After all batches finish, print the updated `emotion_counts`
print("\nPerturbation complete.")
print(f"Final emotion_counts: {emotion_counts}")

NameError: name 'os' is not defined

### For angry emotion

In [None]:
import gc

# 1. Create a temporary directory named `temp_perturbed_audio`
temp_dir = "./temp_perturbed_audio"
os.makedirs(temp_dir, exist_ok=True)
print(f"Temporary directory created at: {temp_dir}")

# 2. Filter the existing `data` list to get only the samples labeled 'neutral'
neutral_samples = [item for item in data if item["emotion"] == "neutral"]
print(f"Initial number of neutral samples: {len(neutral_samples)}")

# 3. Set batch size and target
batch_size = 100
target = 2000
perturbation_counter = 0

# 4. Process in batches
while emotion_counts["neutral"] < target:
    batch_start = emotion_counts["neutral"]
    batch_target = min(batch_start + batch_size, target)

    print(f"\nProcessing batch: {batch_start} -> {batch_target}")

    # Process samples until we reach the batch target
    while emotion_counts["neutral"] < batch_target:
        # Ensure there are neutral samples to pick from
        if not neutral_samples:
            print("No neutral samples found to perturb. Exiting loop.")
            break

        selected_sample = random.choice(neutral_samples)
        original_path = selected_sample["path"]

        try:
            # Apply the `perturb_speed` function (it handles loading internally)
            perturbed_waveform = perturb_speed(original_path)

            # Load the original audio to get the sample rate for saving
            _, sample_rate = torchaudio.load(original_path)

            # Construct a unique temporary file path
            perturbed_file_name = f"perturbed_neutral_{perturbation_counter}.wav"
            perturbed_file_path = os.path.join(temp_dir, perturbed_file_name)

            # Save the perturbed waveform to this temporary file
            torchaudio.save(perturbed_file_path, perturbed_waveform, sample_rate)

            # IMPORTANT: Delete the waveform from memory immediately after saving
            del perturbed_waveform

            # Append a new dictionary entry to the `data` list
            data.append({
                "path": perturbed_file_path,
                "emotion": "neutral"
            })

            # Increment the count for 'neutral' in the `emotion_counts` dictionary
            emotion_counts["neutral"] += 1
            perturbation_counter += 1

        except Exception as e:
            print(f"Error processing {original_path}: {e}")

    # After each batch, clear memory
    gc.collect()
    print(f"Batch complete. Current neutral count: {emotion_counts['neutral']}/{target}")

# 5. After all batches finish, print the updated `emotion_counts`
print("\nPerturbation complete.")
print(f"Final emotion_counts: {emotion_counts}")

## Creating encoders

In [None]:
label2id = {label: i for i, label in enumerate(sorted(set(emotion_map.values())))}
id2label = {v: k for k, v in label2id.items()}

## Loading and preprocessing audio

In [None]:
def preprocess_audio(path):
  wav,sr = torchaudio.load(path)

  # mono
  if wav.shape[0] > 1:
      wav = wav.mean(dim=0)

    # resample
  if sr != 16000:
      resampler = torchaudio.transforms.Resample(sr, 16000)
      wav = resampler(wav)

    # normalize
  wav = wav / wav.abs().max()

  return wav

## Create data compliant with pytorch

In [None]:
from torch.utils.data import Dataset

class RAVDESSDataset(Dataset):
    def __init__(self, data, label2id):
        self.data = data
        self.label2id = label2id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        waveform = preprocess_audio(item["path"])
        label = self.label2id[item["emotion"]]

        return {
            "speech": waveform.squeeze(0).numpy(),
            "label": label
        }

## Split data

In [None]:
train_val_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.1, random_state=42)

print(f"Train samples: {len(train_data)}")
print(f"Val samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

Train samples: 6880
Val samples: 765
Test samples: 1912


## Create Dataset

In [None]:
train_dataset = RAVDESSDataset(train_data, label2id)
val_dataset = RAVDESSDataset(val_data, label2id)
test_dataset = RAVDESSDataset(test_data, label2id)

## Use Wav2Vec2Processor (Batching & padding)

In [None]:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m")
model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-xls-r-300m", num_labels=len(label2id), label2id=label2id, id2label=id2label)
model.freeze_feature_encoder()#for faster training

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Defining the collate function

In [None]:
def collate_fn(batch):
    """Collate function for DataLoader"""
    speeches = [item["speech"] for item in batch]
    labels = torch.tensor([item["label"] for item in batch])

    # Process audio
    inputs = feature_extractor(
        speeches,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
        max_length=160000,  # Max 10 seconds
        truncation=True
    )

    inputs["labels"] = labels
    return inputs

## Defining the Compute metrics

In [None]:
def compute_metrics(eval_pred):
    """Compute evaluation metrics"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'f1': f1
    }

## Training Arguments

In [None]:
training_args = TrainingArguments(
    output_dir="./wav2vec2-ravdess-emotion",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-6,# reduced for fixing overfitting
    per_device_train_batch_size=14, # Reduced batch size
    per_device_eval_batch_size=14,  # Reduced batch size
    gradient_accumulation_steps=1,
    num_train_epochs=7,
    warmup_ratio=0.1,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    report_to="none",  # Disable wandb/tensorboard
    remove_unused_columns=False, # Crucial fix: prevent Trainer from removing 'speech' key
    weight_decay=0.01, # for fixing overfitting
    dataloader_num_workers=2
)

## Create trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Save the model

In [None]:
# Save the model and feature extractor
output_dir = "./wav2vec2-emotion-classifier"  # or whatever your task is

# Save the trained model
model.save_pretrained(output_dir)

# Save the feature extractor
feature_extractor.save_pretrained(output_dir)

print(f"Model and feature extractor saved to {output_dir}")

## Download the model

In [None]:
!cp -r ./wav2vec2-emotion-classifier /content/drive/MyDrive/

In [None]:
!zip -r wav2vec2-classifier.zip ./wav2vec2-emotion-classifier

from google.colab import files
files.download('wav2vec2-classifier.zip')