<a href="https://colab.research.google.com/github/Nsimaar99/Kaggle-Project/blob/master/SpeechEmotion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**EMOTION DETECTION PROJECT USING RAVDESS DATASET**


The Emotion Detection from Spoken Audio project aims to build a model that can classify emotions such as happiness, sadness, anger, and surprise from spoken language. The goal is to detect and categorize the emotional tone of the speaker’s voice by analyzing audio recordings. The project begins by selecting a suitable dataset, such as RAVDESS or EmoReact; the audio is then preprocessed by extracting meaningful features like MFCCs (Mel-Frequency Cepstral Coefficients) or spectrograms. These features are used to train a machine learning model, often leveraging Recurrent Neural Networks (RNNs), LSTMs, or GRUs to handle the sequential nature of audio data. This notebook uses the RAVDESS dataset and fine-tunes a pretrained Wav2Vec2 audio-classification model directly on the raw waveforms.

Once the model is trained, it is evaluated on accuracy, precision, and recall, using a separate test set to ensure that it generalizes well to new data. This emotion detection system can have real-world applications, including improving customer service by analyzing the emotional state of callers, enhancing mental health monitoring, or creating more empathetic voice-based assistants. Key challenges include managing background noise, speaker variability, and data imbalance, all of which can affect the model's performance.

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read the dataset archive
# from /content/drive/MyDrive.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import zipfile
import os

# Location of the dataset archive on the mounted Drive
zip_file_path = '/content/drive/MyDrive/Voiced.zip'

# Directory the archive is unpacked into
extraction_path = '/content/your_extracted_folder'

# Unpack every member; the context manager closes the archive afterwards
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extraction_path)

print(f"Files extracted to {extraction_path}")


Files extracted to /content/your_extracted_folder


In [5]:
import os

# Root folder that holds one sub-directory per actor
main_folder = "/content/your_extracted_folder/audio_speech_actors_01-24"

# Print the .wav files of every actor directory, in os.listdir order
for actor_dir in os.listdir(main_folder):
    actor_path = os.path.join(main_folder, actor_dir)

    # Anything that is not a directory is ignored
    if not os.path.isdir(actor_path):
        continue

    print(f"Files in '{actor_dir}':")

    # Only .wav entries are listed
    for wav_name in (entry for entry in os.listdir(actor_path) if entry.endswith('.wav')):
        print(f"  {wav_name}")

    # Blank line separating one actor's listing from the next
    print()


Files in 'Actor_03':
  03-01-02-02-01-01-03.wav
  03-01-06-02-02-02-03.wav
  03-01-08-02-01-02-03.wav
  03-01-08-02-02-02-03.wav
  03-01-08-01-01-02-03.wav
  03-01-07-01-02-02-03.wav
  03-01-05-01-01-02-03.wav
  03-01-04-01-01-01-03.wav
  03-01-05-02-02-01-03.wav
  03-01-08-02-01-01-03.wav
  03-01-03-01-02-02-03.wav
  03-01-03-01-02-01-03.wav
  03-01-08-01-02-01-03.wav
  03-01-02-02-02-02-03.wav
  03-01-03-01-01-01-03.wav
  03-01-04-01-01-02-03.wav
  03-01-08-01-01-01-03.wav
  03-01-07-02-01-02-03.wav
  03-01-02-02-01-02-03.wav
  03-01-04-01-02-01-03.wav
  03-01-07-01-02-01-03.wav
  03-01-05-02-01-02-03.wav
  03-01-01-01-01-01-03.wav
  03-01-02-01-02-02-03.wav
  03-01-01-01-02-01-03.wav
  03-01-07-01-01-01-03.wav
  03-01-03-02-02-01-03.wav
  03-01-04-02-02-02-03.wav
  03-01-07-01-01-02-03.wav
  03-01-06-01-01-02-03.wav
  03-01-06-02-01-02-03.wav
  03-01-03-02-02-02-03.wav
  03-01-07-02-02-02-03.wav
  03-01-04-02-01-01-03.wav
  03-01-04-02-02-01-03.wav
  03-01-02-01-01-01-03.wav
  03-01

In [6]:
import os

# Root folder that holds one sub-directory per actor
main_folder = "/content/your_extracted_folder/audio_speech_actors_01-24"

# Collect the entries that are directories, preserving os.listdir order
subdirectories = [
    name
    for name in os.listdir(main_folder)
    if os.path.isdir(os.path.join(main_folder, name))
]

# Report each directory on its own line
for name in subdirectories:
    print(f"Directory: {name}")


Directory: Actor_03
Directory: Actor_01
Directory: Actor_04
Directory: Actor_10
Directory: Actor_17
Directory: Actor_20
Directory: Actor_18
Directory: Actor_05
Directory: Actor_06
Directory: Actor_15
Directory: Actor_12
Directory: Actor_22
Directory: Actor_08
Directory: Actor_14
Directory: Actor_11
Directory: Actor_16
Directory: Actor_13
Directory: Actor_24
Directory: Actor_23
Directory: Actor_02
Directory: Actor_19
Directory: Actor_09
Directory: Actor_07
Directory: Actor_21


In [7]:
import os
import shutil

# Emotion labels dictionary (adjust as needed).
# Keys are the two-digit emotion codes taken from the third hyphen-separated
# field of each filename; values are the folder / class names used downstream.
emotion_labels = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

# Path to the extracted dataset and to the per-emotion output tree
dataset_path = "/content/your_extracted_folder/audio_speech_actors_01-24"
output_path = "/content/grouped_by_emotion"

# Create the output folder if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Group the files by emotion.
# NOTE: files are *moved*, so after this cell has run the actor folders no
# longer contain the recordings; re-extract the archive to start over.
for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if not file.endswith(".wav"):
            continue

        # Split the filename by hyphens to extract the emotion code
        components = file.split("-")

        # A name with fewer than three fields carries no emotion code. Leave the
        # file where it is instead of crashing the whole run with an IndexError.
        if len(components) < 3:
            print(f"Skipping {file}: filename has no emotion code field")
            continue

        emotion_code = components[2]  # Emotion code is at the 3rd position (index 2)

        # Codes missing from the dictionary are collected under "unknown"
        emotion = emotion_labels.get(emotion_code, "unknown")

        # Create the emotion folder if it doesn't exist
        emotion_folder = os.path.join(output_path, emotion)
        os.makedirs(emotion_folder, exist_ok=True)

        # Move the file to the corresponding emotion folder
        source_file = os.path.join(root, file)
        destination_file = os.path.join(emotion_folder, file)
        shutil.move(source_file, destination_file)

        print(f"Moved {file} to {emotion} folder")

print("Files grouped by emotion!")


Moved 03-01-02-02-01-01-03.wav to calm folder
Moved 03-01-06-02-02-02-03.wav to fearful folder
Moved 03-01-08-02-01-02-03.wav to surprised folder
Moved 03-01-08-02-02-02-03.wav to surprised folder
Moved 03-01-08-01-01-02-03.wav to surprised folder
Moved 03-01-07-01-02-02-03.wav to disgust folder
Moved 03-01-05-01-01-02-03.wav to angry folder
Moved 03-01-04-01-01-01-03.wav to sad folder
Moved 03-01-05-02-02-01-03.wav to angry folder
Moved 03-01-08-02-01-01-03.wav to surprised folder
Moved 03-01-03-01-02-02-03.wav to happy folder
Moved 03-01-03-01-02-01-03.wav to happy folder
Moved 03-01-08-01-02-01-03.wav to surprised folder
Moved 03-01-02-02-02-02-03.wav to calm folder
Moved 03-01-03-01-01-01-03.wav to happy folder
Moved 03-01-04-01-01-02-03.wav to sad folder
Moved 03-01-08-01-01-01-03.wav to surprised folder
Moved 03-01-07-02-01-02-03.wav to disgust folder
Moved 03-01-02-02-01-02-03.wav to calm folder
Moved 03-01-04-01-02-01-03.wav to sad folder
Moved 03-01-07-01-02-01-03.wav to disgu

In [8]:
import os
import shutil
import random

# Paths
dataset_path = "/content/grouped_by_emotion"
output_base_path = "/content/split_dataset"

# Split ratio.
# The train and validation sizes are truncated to whole files; the test split
# receives whatever remains, so test_ratio is informational only.
train_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15

# Fixed seed so that rebuilding the split from a fresh extraction assigns every
# file to the same subset. An unseeded shuffle would produce a different split
# on each run and could mix training files into validation/test.
split_seed = 42
split_rng = random.Random(split_seed)

# Create output directories for the splits
for split in ['train', 'validation', 'test']:
    os.makedirs(os.path.join(output_base_path, split), exist_ok=True)

# Loop through each emotion folder.
# Sorted so the sequence of shuffles (and therefore the result) does not depend
# on the arbitrary order returned by os.listdir.
# NOTE(review): files are split individually, so recordings of the same actor
# can land in both train and test - confirm whether a per-actor split is wanted.
for emotion in sorted(os.listdir(dataset_path)):
    emotion_folder = os.path.join(dataset_path, emotion)
    if os.path.isdir(emotion_folder):
        # Get all .wav files for the current emotion in a stable order
        files = sorted(f for f in os.listdir(emotion_folder) if f.endswith(".wav"))

        # Shuffle files for random (but reproducible) splitting
        split_rng.shuffle(files)

        # Split files into train, validation, and test
        num_files = len(files)
        train_end = int(train_ratio * num_files)
        validation_end = train_end + int(validation_ratio * num_files)

        # File lists per split
        train_files = files[:train_end]
        validation_files = files[train_end:validation_end]
        test_files = files[validation_end:]

        # Move files to the corresponding split folder
        for split, file_list in zip(['train', 'validation', 'test'], [train_files, validation_files, test_files]):
            split_folder = os.path.join(output_base_path, split, emotion)
            os.makedirs(split_folder, exist_ok=True)

            for file in file_list:
                source_file = os.path.join(emotion_folder, file)
                destination_file = os.path.join(split_folder, file)
                shutil.move(source_file, destination_file)

            print(f"Moved {len(file_list)} files to {split}/{emotion}")

print("Dataset split into train, validation, and test sets!")


Moved 134 files to train/calm
Moved 28 files to validation/calm
Moved 30 files to test/calm
Moved 134 files to train/disgust
Moved 28 files to validation/disgust
Moved 30 files to test/disgust
Moved 134 files to train/surprised
Moved 28 files to validation/surprised
Moved 30 files to test/surprised
Moved 134 files to train/angry
Moved 28 files to validation/angry
Moved 30 files to test/angry
Moved 134 files to train/sad
Moved 28 files to validation/sad
Moved 30 files to test/sad
Moved 134 files to train/happy
Moved 28 files to validation/happy
Moved 30 files to test/happy
Moved 67 files to train/neutral
Moved 14 files to validation/neutral
Moved 15 files to test/neutral
Moved 134 files to train/fearful
Moved 28 files to validation/fearful
Moved 30 files to test/fearful
Dataset split into train, validation, and test sets!


In [9]:
import os
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from sklearn.preprocessing import LabelEncoder

class VoiceEmotionDataset(Dataset):
    """Dataset of (mono waveform, integer emotion label) pairs.

    The directory ``data_dir`` is expected to contain one sub-folder per
    emotion. Every ``.wav`` file found there is labelled from the emotion code
    in its filename (third hyphen-separated field), not from the folder name.
    """

    def __init__(self, data_dir, emotion_labels, sample_rate=16000):
        """
        Args:
            data_dir (str): Path to the directory containing subfolders for emotions.
            emotion_labels (dict): Mapping of emotion codes to labels.
            sample_rate (int): Desired sample rate for audio.
        """
        self.data_dir = data_dir
        self.emotion_labels = emotion_labels
        self.sample_rate = sample_rate
        self.files = []  # list of (audio path, emotion label string)
        self.label_encoder = LabelEncoder()

        # Fit on every possible label (not only those present in this split) so
        # that train, validation and test share the same label -> integer map.
        self.label_encoder.fit(list(self.emotion_labels.values()))

        for emotion in os.listdir(data_dir):
            emotion_folder = os.path.join(data_dir, emotion)
            if not os.path.isdir(emotion_folder):
                continue
            for file in os.listdir(emotion_folder):
                if not file.endswith(".wav"):
                    continue
                # None for names without an emotion field; such files fall
                # through to the warning below instead of raising IndexError.
                emotion_code = self._emotion_code(file)
                if emotion_code in emotion_labels:
                    self.files.append((os.path.join(emotion_folder, file), emotion_labels[emotion_code]))
                else:
                    print(f"Warning: File '{file}' has unknown emotion code '{emotion_code}'. Skipping.")

    @staticmethod
    def _emotion_code(filename):
        """Return the third hyphen-separated field of ``filename``, or None if absent."""
        fields = filename.split("-")
        return fields[2] if len(fields) > 2 else None

    @staticmethod
    def _to_mono(waveform):
        """Collapse a (channels, frames) tensor to a 1-D (frames,) tensor.

        Single-channel input is squeezed unchanged. Multi-channel input is
        averaged across channels; squeezing alone would leave it 2-D and the
        collate step expects one 1-D array per example.
        """
        if waveform.dim() > 1 and waveform.size(0) > 1:
            return waveform.mean(dim=0)
        return waveform.squeeze(0)

    def __len__(self):
        """Number of usable audio files discovered under ``data_dir``."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load item ``idx`` and return ``(waveform, label)``.

        The waveform is resampled to ``self.sample_rate`` when the file uses a
        different rate and is returned as a 1-D tensor; the label is the
        integer id assigned by the fitted ``LabelEncoder``.
        """
        audio_path, emotion = self.files[idx]
        waveform, sr = torchaudio.load(audio_path)
        if sr != self.sample_rate:
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.sample_rate)(waveform)

        # Get the emotion label (as an integer)
        label = self.label_encoder.transform([emotion])[0]

        return self._to_mono(waveform), label



In [10]:
!pip install transformers
from transformers import AutoFeatureExtractor, Wav2Vec2Processor, AutoModelForAudioClassification
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio
import os

# Instead of AutoProcessor, use AutoFeatureExtractor and AutoModelForAudioClassification.
# Both objects are loaded from the same pretrained checkpoint.
checkpoint_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint_name)
model = AutoModelForAudioClassification.from_pretrained(checkpoint_name)


# Collate function handed to the DataLoaders
def collate_fn(batch):
    """Turn a list of (waveform, label) samples into one model-ready batch.

    The waveforms are converted to NumPy arrays and passed to the module-level
    ``feature_extractor`` with padding enabled; the labels are attached to the
    result under the ``"labels"`` key as a tensor.
    """
    audio_arrays = []
    targets = []
    for sample in batch:
        audio_arrays.append(sample[0].numpy())  # raw waveform as a NumPy array
        targets.append(sample[1])  # integer class id

    # Pad the waveforms to a common length and return PyTorch tensors
    encoded = feature_extractor(audio_arrays, sampling_rate=16000, return_tensors="pt", padding=True)
    encoded["labels"] = torch.tensor(targets)

    return encoded




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequen

In [11]:
# Directories produced by the split step, one per subset
train_data_dir, val_data_dir, test_data_dir = (
    "/content/split_dataset/train",
    "/content/split_dataset/validation",
    "/content/split_dataset/test",
)

# One dataset per subset, all sharing the same emotion-code mapping
train_dataset, val_dataset, test_dataset = (
    VoiceEmotionDataset(split_dir, emotion_labels)
    for split_dir in (train_data_dir, val_data_dir, test_data_dir)
)

# Number of examples per batch (adjust as needed)
batch_size = 32

# Only the training data is shuffled; validation and test keep a fixed order
train_loader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, collate_fn=collate_fn, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=batch_size, shuffle=False)


In [12]:
import torch
from transformers import AutoModelForAudioClassification

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained audio-classification checkpoint.
# The load warning printed below this cell reports classifier weights as newly
# initialized, so the classification head still has to be trained.
model = AutoModelForAudioClassification.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")

# Place the model on the chosen device; as the cell's last expression this
# also displays the module structure.
model.to(device)


Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=

In [13]:
# Use PyTorch's AdamW: the copy exported by `transformers` is deprecated in
# favour of torch.optim.AdamW and is no longer available in newer releases,
# where `from transformers import AdamW` fails with an ImportError.
from torch.optim import AdamW

# Define optimizer.
# eps and weight_decay are set explicitly to the defaults the transformers
# implementation used, so the optimisation behaviour stays the same.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)  # Fine-tuning with a smaller learning rate

# Define loss function (Cross-Entropy loss for classification)
criterion = torch.nn.CrossEntropyLoss()



In [None]:
# Number of epochs for training; passed explicitly to train_model() at the end
# of this cell, overriding that function's default of 10.
epochs = 15

# Import tqdm for progress bar
from tqdm.auto import tqdm

# Training loop
def train_model(train_loader, val_loader, model, optimizer, device, criterion, epochs=10):
    """Fine-tune ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        train_loader: Iterable of batches providing ``"input_values"`` and
            ``"labels"`` and, optionally, ``"attention_mask"``.
        val_loader: Batches handed to ``validate_model`` after every epoch.
        model: Model called with ``input_values``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        optimizer: Optimizer that updates the model parameters.
        device: Device the batch tensors are moved to.
        criterion: Forwarded to ``validate_model``. It is not used to compute
            the training loss, which is taken from the model output.
        epochs (int): Number of passes over ``train_loader``.
    """
    for epoch in range(epochs):
        model.train()  # Set the model to training mode
        running_loss = 0.0
        num_batches = 0

        # Iterate over the training dataset
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
            # Get the batch data and move it to the correct device
            inputs = batch["input_values"].to(device)
            labels = batch["labels"].to(device)

            # Batches are padded to a common length. Forward the attention
            # mask when the feature extractor produced one so that padded
            # positions are ignored by the model.
            attention_mask = batch.get("attention_mask")
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)

            optimizer.zero_grad()  # Zero the gradients

            # Forward pass; passing labels makes the model return the loss
            outputs = model(input_values=inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            loss.backward()  # Compute gradients
            optimizer.step()  # Update weights

            running_loss += loss.item()  # Accumulate loss for monitoring
            num_batches += 1

        # Mean loss over the batches actually seen (NaN if the loader was empty)
        epoch_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {epoch_loss:.4f}")

        # Run validation after each epoch
        val_loss = validate_model(val_loader, model, device, criterion)
        print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss:.4f}")

# Validation loop
def validate_model(val_loader, model, device, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the mean batch loss.

    Prints the classification accuracy as a side effect.

    Args:
        val_loader: Iterable of batches providing ``"input_values"`` and
            ``"labels"`` and, optionally, ``"attention_mask"``.
        model: Model called with ``input_values``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        device: Device the batch tensors are moved to.
        criterion: Accepted for interface compatibility; the loss is taken
            from the model output.

    Returns:
        float: Loss averaged over the batches, or NaN when ``val_loader``
        yields no data.
    """
    model.eval()  # Set the model to evaluation mode
    running_val_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    num_batches = 0

    with torch.no_grad():  # Disable gradient calculation during validation
        for batch in tqdm(val_loader, desc="Validating"):
            inputs = batch["input_values"].to(device)
            labels = batch["labels"].to(device)

            # Forward the padding mask when the feature extractor produced one
            # so that padded positions are ignored by the model.
            attention_mask = batch.get("attention_mask")
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)

            # Forward pass
            outputs = model(input_values=inputs, attention_mask=attention_mask, labels=labels)

            # Accumulate validation loss
            running_val_loss += outputs.loss.item()
            num_batches += 1

            # Predicted class is the index of the largest logit
            predicted = outputs.logits.argmax(dim=1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    # Nothing to average over: report it instead of dividing by zero
    if num_batches == 0 or total_samples == 0:
        print("Validation Accuracy: n/a (validation set is empty)")
        return float("nan")

    # Calculate average validation loss and accuracy
    val_loss = running_val_loss / num_batches
    val_accuracy = correct_predictions / total_samples * 100
    print(f"Validation Accuracy: {val_accuracy:.2f}%")

    return val_loss

# Start fine-tuning. train_loader, val_loader, model, optimizer, device and
# criterion must already exist from the earlier cells; `epochs` is the value
# set at the top of this cell.
train_model(train_loader, val_loader, model, optimizer, device, criterion, epochs)

Training Epoch 1/15:   0%|          | 0/41 [00:00<?, ?it/s]