<a href="https://colab.research.google.com/github/Nsimaar99/Kaggle-Project/blob/master/SpeechEmotion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**EMOTION DETECTION PROJECT USING RAVDESS DATASET**


The Emotion Detection from Spoken Audio project aims to build a model that classifies emotions such as happiness, sadness, anger, and surprise from speech. By analyzing audio recordings, the model detects and categorizes the emotional tone of the speaker’s voice. The project begins with selecting a suitable dataset, such as RAVDESS or EmoReact (this notebook uses RAVDESS), and then preprocesses the audio by extracting meaningful features like MFCCs (Mel-Frequency Cepstral Coefficients) or spectrograms. These features are typically used to train a machine learning model, often leveraging Recurrent Neural Networks (RNNs), LSTMs, or GRUs to handle the sequential nature of audio data; this notebook instead fine-tunes a pretrained HuBERT transformer directly on the resampled raw waveforms.

Once the model is trained, it is evaluated based on accuracy, precision, and recall, using a separate test set to ensure the model generalizes well to new data. This emotion detection system can have real-world applications, including improving customer service by analyzing the emotional state of callers, enhancing mental health monitoring, or creating more empathetic voice-based assistants. The project challenges include managing background noise, speaker variability, and data imbalance, all of which can affect the model's performance.

In [None]:
# Mount Google Drive at /content/drive so that files stored on the drive can
# be read through the local filesystem of the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile
import os

# Where the archive lives on the mounted drive.
zip_file_path = '/content/drive/MyDrive/Voiced.zip'

# Local folder that receives the unpacked contents.
extraction_path = '/content/your_extracted_folder'

# Open the archive read-only and unpack every member; leaving the `with`
# block closes the archive again.
archive = zipfile.ZipFile(zip_file_path, mode='r')
with archive:
    archive.extractall(path=extraction_path)

print(f"Files extracted to {extraction_path}")


Files extracted to /content/your_extracted_folder


In [None]:
import os

# Root folder that contains one sub-folder per actor.
main_folder = "/content/your_extracted_folder/audio_speech_actors_01-24"

# Visit every entry of the root folder and report the .wav files of each
# actor folder.
for actor_dir in os.listdir(main_folder):
    actor_path = os.path.join(main_folder, actor_dir)

    # Entries that are not folders are ignored.
    if not os.path.isdir(actor_path):
        continue

    print(f"Files in '{actor_dir}':")

    # Keep only the .wav entries, in the order os.listdir returned them.
    wav_names = [name for name in os.listdir(actor_path) if name.endswith('.wav')]
    for wav_name in wav_names:
        print(f"  {wav_name}")

    print()  # Blank separator line after each actor folder


Files in 'Actor_03':
  03-01-08-02-01-01-03.wav
  03-01-06-01-01-02-03.wav
  03-01-06-01-02-02-03.wav
  03-01-07-02-02-01-03.wav
  03-01-04-01-02-02-03.wav
  03-01-08-01-01-01-03.wav
  03-01-07-01-01-02-03.wav
  03-01-08-01-01-02-03.wav
  03-01-07-02-01-01-03.wav
  03-01-06-02-02-01-03.wav
  03-01-05-02-01-02-03.wav
  03-01-03-01-02-01-03.wav
  03-01-05-01-02-02-03.wav
  03-01-05-01-01-01-03.wav
  03-01-05-01-02-01-03.wav
  03-01-08-02-02-01-03.wav
  03-01-01-01-02-02-03.wav
  03-01-02-02-01-02-03.wav
  03-01-07-01-01-01-03.wav
  03-01-04-02-01-02-03.wav
  03-01-08-01-02-01-03.wav
  03-01-06-02-02-02-03.wav
  03-01-06-01-01-01-03.wav
  03-01-02-01-02-01-03.wav
  03-01-04-01-01-02-03.wav
  03-01-05-01-01-02-03.wav
  03-01-04-02-01-01-03.wav
  03-01-04-02-02-02-03.wav
  03-01-03-02-01-01-03.wav
  03-01-01-01-01-01-03.wav
  03-01-05-02-02-01-03.wav
  03-01-07-01-02-02-03.wav
  03-01-03-02-02-01-03.wav
  03-01-04-01-01-01-03.wav
  03-01-04-02-02-01-03.wav
  03-01-06-02-01-02-03.wav
  03-01

In [None]:
import os

# Root folder whose immediate sub-folders are reported.
main_folder = "/content/your_extracted_folder/audio_speech_actors_01-24"

# Collect the entries that are folders, keeping the order os.listdir returned.
directory_names = [
    entry
    for entry in os.listdir(main_folder)
    if os.path.isdir(os.path.join(main_folder, entry))
]

# Report one line per folder.
for directory_name in directory_names:
    print(f"Directory: {directory_name}")


Directory: Actor_03
Directory: Actor_14
Directory: Actor_21
Directory: Actor_22
Directory: Actor_17
Directory: Actor_12
Directory: Actor_07
Directory: Actor_18
Directory: Actor_05
Directory: Actor_06
Directory: Actor_24
Directory: Actor_10
Directory: Actor_16
Directory: Actor_13
Directory: Actor_11
Directory: Actor_15
Directory: Actor_04
Directory: Actor_08
Directory: Actor_01
Directory: Actor_20
Directory: Actor_23
Directory: Actor_19
Directory: Actor_02
Directory: Actor_09


In [None]:
import os
import shutil

# Emotion labels dictionary (adjust as needed).
# Keys are the codes read from the third hyphen-separated field of a file
# name, e.g. "03-01-08-02-01-01-03.wav" carries the code "08".
emotion_labels = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

# Path to the extracted dataset and the folder that receives one sub-folder
# per emotion.
dataset_path = "/content/your_extracted_folder/audio_speech_actors_01-24"
output_path = "/content/grouped_by_emotion"

# Create the output folder if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Group the files by emotion.
# NOTE: files are moved, not copied, so the source tree is emptied of .wav
# files as this loop runs.
for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if not file.endswith(".wav"):
            continue

        # Split the filename by hyphens to extract the emotion code
        components = file.split("-")
        if len(components) < 3:
            # The name has no third field to read a code from; leave the file
            # in place instead of failing with an IndexError.
            print(f"Skipping {file}: file name has no emotion code")
            continue
        emotion_code = components[2]  # Emotion code is at the 3rd position (index 2)

        # Codes missing from the dictionary are collected under "unknown"
        emotion = emotion_labels.get(emotion_code, "unknown")

        # Create the emotion folder if it doesn't exist
        emotion_folder = os.path.join(output_path, emotion)
        os.makedirs(emotion_folder, exist_ok=True)

        # Move the file to the corresponding emotion folder
        source_file = os.path.join(root, file)
        destination_file = os.path.join(emotion_folder, file)
        shutil.move(source_file, destination_file)

        print(f"Moved {file} to {emotion} folder")

print("Files grouped by emotion!")


Moved 03-01-08-02-01-01-03.wav to surprised folder
Moved 03-01-06-01-01-02-03.wav to fearful folder
Moved 03-01-06-01-02-02-03.wav to fearful folder
Moved 03-01-07-02-02-01-03.wav to disgust folder
Moved 03-01-04-01-02-02-03.wav to sad folder
Moved 03-01-08-01-01-01-03.wav to surprised folder
Moved 03-01-07-01-01-02-03.wav to disgust folder
Moved 03-01-08-01-01-02-03.wav to surprised folder
Moved 03-01-07-02-01-01-03.wav to disgust folder
Moved 03-01-06-02-02-01-03.wav to fearful folder
Moved 03-01-05-02-01-02-03.wav to angry folder
Moved 03-01-03-01-02-01-03.wav to happy folder
Moved 03-01-05-01-02-02-03.wav to angry folder
Moved 03-01-05-01-01-01-03.wav to angry folder
Moved 03-01-05-01-02-01-03.wav to angry folder
Moved 03-01-08-02-02-01-03.wav to surprised folder
Moved 03-01-01-01-02-02-03.wav to neutral folder
Moved 03-01-02-02-01-02-03.wav to calm folder
Moved 03-01-07-01-01-01-03.wav to disgust folder
Moved 03-01-04-02-01-02-03.wav to sad folder
Moved 03-01-08-01-02-01-03.wav to

In [None]:
import os
import shutil
import random

# Paths: input folder with one sub-folder per emotion, and the root of the
# train/validation/test output tree.
dataset_path = "/content/grouped_by_emotion"
output_base_path = "/content/split_dataset"

# Split ratio
train_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15  # the test split receives every file left after train and validation

# Fail early if the three ratios do not describe a full partition of the data
if abs(train_ratio + validation_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError("train_ratio, validation_ratio and test_ratio must sum to 1")

# Seeded generator so that the same input folder always yields the same split
split_seed = 42
rng = random.Random(split_seed)

split_names = ['train', 'validation', 'test']

# Create output directories for the splits
for split in split_names:
    os.makedirs(os.path.join(output_base_path, split), exist_ok=True)

# Loop through each emotion folder (sorted, because os.listdir order is arbitrary)
for emotion in sorted(os.listdir(dataset_path)):
    emotion_folder = os.path.join(dataset_path, emotion)
    if not os.path.isdir(emotion_folder):
        continue

    # Get all .wav files for the current emotion. Sorting first gives the
    # seeded shuffle a fixed starting order, which is what makes it repeatable.
    files = sorted(f for f in os.listdir(emotion_folder) if f.endswith(".wav"))

    # Shuffle files for random splitting
    rng.shuffle(files)

    # Split points; int() truncates, so any rounding remainder ends up in test
    num_files = len(files)
    train_end = int(train_ratio * num_files)
    validation_end = train_end + int(validation_ratio * num_files)

    train_files = files[:train_end]
    validation_files = files[train_end:validation_end]
    test_files = files[validation_end:]

    # Move files to the corresponding split folder.
    # NOTE: files are moved, not copied, so the emotion folders are emptied.
    for split, file_list in zip(split_names, [train_files, validation_files, test_files]):
        split_folder = os.path.join(output_base_path, split, emotion)
        os.makedirs(split_folder, exist_ok=True)

        for file in file_list:
            source_file = os.path.join(emotion_folder, file)
            destination_file = os.path.join(split_folder, file)
            shutil.move(source_file, destination_file)

        print(f"Moved {len(file_list)} files to {split}/{emotion}")

print("Dataset split into train, validation, and test sets!")


Moved 134 files to train/angry
Moved 28 files to validation/angry
Moved 30 files to test/angry
Moved 134 files to train/surprised
Moved 28 files to validation/surprised
Moved 30 files to test/surprised
Moved 134 files to train/sad
Moved 28 files to validation/sad
Moved 30 files to test/sad
Moved 134 files to train/disgust
Moved 28 files to validation/disgust
Moved 30 files to test/disgust
Moved 134 files to train/calm
Moved 28 files to validation/calm
Moved 30 files to test/calm
Moved 67 files to train/neutral
Moved 14 files to validation/neutral
Moved 15 files to test/neutral
Moved 134 files to train/happy
Moved 28 files to validation/happy
Moved 30 files to test/happy
Moved 134 files to train/fearful
Moved 28 files to validation/fearful
Moved 30 files to test/fearful
Dataset split into train, validation, and test sets!


In [None]:
!pip install transformers datasets --upgrade



In [None]:
!pip install transformers datasets torchaudio --upgrade



In [None]:
# ipython-input-8-41a220d20f6e
import os
import torchaudio
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification  # Updated import
from sklearn.preprocessing import LabelEncoder

# Define dataset class for emotion recognition
class VoiceEmotionDataset(Dataset):
    """Dataset of ``(waveform, label)`` pairs for speech emotion recognition.

    ``.wav`` files are collected from the immediate sub-folders of
    ``data_dir``. The label of a file comes from the third hyphen-separated
    field of its name, looked up in ``emotion_labels``; files whose name has
    no such field, or whose code is not in ``emotion_labels``, are left out.
    """

    def __init__(self, data_dir, emotion_labels, sample_rate=16000):
        """
        Args:
            data_dir (str): Path to the directory containing subfolders for emotions.
            emotion_labels (dict): Mapping of emotion codes to labels.
            sample_rate (int): Desired sample rate for audio.
        """
        self.data_dir = data_dir
        self.emotion_labels = emotion_labels
        self.sample_rate = sample_rate
        self.files = []  # list of (audio path, emotion label string)
        self.label_encoder = LabelEncoder()

        # Fit on every label of the mapping, not only on the labels found in
        # data_dir, so that each instance built from the same mapping encodes
        # a given emotion to the same integer.
        all_emotions = list(self.emotion_labels.values())
        self.label_encoder.fit(all_emotions)

        # Collect file paths and labels. Directory listings are sorted so the
        # sample order does not depend on the filesystem.
        for emotion in sorted(os.listdir(data_dir)):
            emotion_folder = os.path.join(data_dir, emotion)
            if not os.path.isdir(emotion_folder):
                continue
            for file in sorted(os.listdir(emotion_folder)):
                if not file.endswith(".wav"):
                    continue
                name_fields = file.split("-")
                if len(name_fields) < 3:
                    # No emotion field in the name; skip instead of raising.
                    continue
                emotion_code = name_fields[2]
                if emotion_code in emotion_labels:
                    self.files.append((os.path.join(emotion_folder, file), emotion_labels[emotion_code]))

    def __len__(self):
        """Return the number of collected audio files."""
        return len(self.files)

    @staticmethod
    def _to_mono(waveform):
        """Reduce a ``(channels, samples)`` tensor to 1-D by averaging channels.

        A single-channel input keeps its sample values; a tensor that is
        already 1-D is returned unchanged.
        """
        if waveform.dim() == 1:
            return waveform
        return waveform.mean(dim=0)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(waveform, label)`` where ``waveform`` is a 1-D tensor at
            ``self.sample_rate`` and ``label`` is the encoded emotion, or
            ``None`` when the file is empty or cannot be loaded (the caller is
            expected to drop ``None`` entries).
        """
        audio_path, emotion = self.files[idx]

        try:
            waveform, sr = torchaudio.load(audio_path)
        except Exception as e:
            # Deliberately broad: one unreadable file should not stop an epoch.
            print(f"Error loading audio file: {audio_path}. Error: {e}")
            return None  # Skip this sample

        if waveform.numel() == 0:
            print(f"Warning: Empty audio file detected: {audio_path}. Skipping.")
            return None  # Skip this sample

        # Downmix before resampling: squeeze(0) would leave a multi-channel
        # recording 2-D, while the rest of the pipeline expects 1-D audio.
        waveform = self._to_mono(waveform)

        if sr != self.sample_rate:
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.sample_rate)(waveform)

        # Get the emotion label as an integer
        label = self.label_encoder.transform([emotion])[0]

        return waveform, label

In [None]:
# Initialize feature extractor and model from Hugging Face for speech emotion recognition
# Instead of AutoProcessor, use AutoFeatureExtractor
model_name = "xmj2002/hubert-base-ch-speech-emotion-recognition"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)

# The number of emotion labels in your dataset
num_labels = len(emotion_labels)

# Load the model using AutoModelForAudioClassification and let from_pretrained
# build the classification head with `num_labels` outputs. Passing num_labels
# also stores it in model.config; replacing model.classifier by hand leaves the
# config at its previous value.
# NOTE(review): to my knowledge the Hubert sequence-classification forward pass
# reshapes the logits with config.num_labels when computing the loss, so a
# stale value would misalign logits and labels - confirm against the installed
# transformers version.
model = AutoModelForAudioClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,  # tolerate a checkpoint head of a different size
)

# Move model to the selected device (e.g., 'cuda' or 'cpu')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at xmj2002/hubert-base-ch-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HubertForSequenceClassification(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertGroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x HubertNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encod

In [None]:
# Locations of the three dataset partitions (train, validation, test).
train_data_dir, val_data_dir, test_data_dir = (
    "/content/split_dataset/train",
    "/content/split_dataset/validation",
    "/content/split_dataset/test",
)

In [None]:
def collate_fn(batch):
    """Collate dataset samples into one model-ready batch of fixed size.

    ``None`` entries are dropped, and the batch is then topped up to the
    module-level ``batch_size`` with all-zero waveforms labelled ``-1`` so
    that consumers can recognise and discard the filler samples.

    Args:
        batch: List of ``(waveform, label)`` tuples or ``None``. Each waveform
            must offer ``.numpy()`` (presumably a 1-D CPU tensor - confirm
            against the dataset in use).

    Returns:
        A mapping with the output of the module-level ``feature_extractor``
        plus a ``"labels"`` tensor. When no valid sample is left, a plain dict
        with an all-zero ``"input_values"`` tensor of shape
        ``(batch_size, 16000)`` and ``"labels"`` filled with ``-1``.
    """
    # Imported here so the function does not depend on another cell having
    # imported numpy first.
    import numpy as np

    # Filter out None values
    samples = [item for item in batch if item is not None]

    # If the batch is empty, create a placeholder
    if not samples:
        return {
            "input_values": torch.zeros(batch_size, 16000),  # Example placeholder
            "labels": torch.full((batch_size,), -1, dtype=torch.long),  # Placeholder labels
        }

    waveforms = [item[0].numpy() for item in samples]  # Extract raw waveforms
    labels = [int(item[1]) for item in samples]  # Extract labels as plain ints

    # Add dummy data if batch is smaller than required size
    missing_samples = max(0, batch_size - len(samples))
    waveforms.extend(np.zeros_like(waveforms[0]) for _ in range(missing_samples))
    labels.extend([-1] * missing_samples)  # -1 marks a sample to be ignored

    # Process with feature extractor; padding=True pads to the longest waveform
    processed = feature_extractor(waveforms, sampling_rate=16000, return_tensors="pt", padding=True)
    processed["labels"] = torch.tensor(labels, dtype=torch.long)

    return processed



In [None]:
# Create dataset instances
train_dataset = VoiceEmotionDataset(train_data_dir, emotion_labels)
val_dataset = VoiceEmotionDataset(val_data_dir, emotion_labels)
test_dataset = VoiceEmotionDataset(test_data_dir, emotion_labels)

# Define batch size (collate_fn reads this module-level value)
batch_size = 10 # Adjust as needed

# Create DataLoaders with the adjusted collate function; only the training
# data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Import the AdamW optimizer from PyTorch. The AdamW class that used to be
# exported by transformers is deprecated and cannot be imported from recent
# releases, which the `pip install --upgrade` cell will pull in.
from torch.optim import AdamW

# Define optimizer. eps and weight_decay are set explicitly to the defaults of
# the former transformers implementation (PyTorch defaults are 1e-8 and 0.01),
# so the update rule stays the same.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)  # Fine-tuning with a smaller learning rate

# Define loss function (Cross-Entropy loss for classification)
criterion = torch.nn.CrossEntropyLoss()



In [None]:
# Training loop with Gradient Accumulation and Mixed Precision
def train_model(train_loader, val_loader, model, optimizer, device, criterion, epochs=10):
    """Train ``model`` for ``epochs`` epochs and validate after each epoch.

    Uses gradient accumulation and optional mixed precision. Besides its
    arguments, the function reads these module-level names: ``use_fp16``,
    ``autocast``, ``scaler``, ``accumulation_steps`` and ``tqdm``.

    Args:
        train_loader: Iterable of batches holding ``"input_values"`` and
            ``"labels"``. Samples labelled ``-1`` are treated as filler and
            dropped before the forward pass.
        val_loader: Passed to ``validate_model`` after every epoch.
        model: Called as ``model(input_values=..., labels=...)``; the result
            must expose ``.loss`` and ``.logits``.
        optimizer: Optimizer that updates the model parameters.
        device: Device the batch tensors are moved to.
        criterion: Passed on to ``validate_model``. The training loss itself
            is the one returned by the model.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Loss and accuracy are printed per epoch.
    """

    def apply_optimizer_step():
        # Apply the accumulated gradients, then clear them for the next group.
        if use_fp16:
            scaler.step(optimizer)
            scaler.update()
        else:
            optimizer.step()
        optimizer.zero_grad()

    for epoch in range(epochs):
        # validate_model() switches the model to eval mode, so training mode
        # has to be restored at the start of every epoch, not just once.
        model.train()

        running_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        processed_batches = 0  # batches that reached the forward pass
        pending_batches = 0    # batches whose gradients are not applied yet

        optimizer.zero_grad()  # Zero the gradients before starting each epoch

        # Iterate over the training dataset
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
            # Skip empty batches
            if len(batch["input_values"]) == 0:
                continue

            # Get the batch data and move it to the correct device
            inputs = batch["input_values"].to(device)
            labels = batch["labels"].to(device)

            # Ignore dummy samples (labels == -1)
            mask = labels != -1
            inputs = inputs[mask]
            labels = labels[mask]

            # Skip batch if it becomes empty after filtering
            if inputs.size(0) == 0:
                continue

            # Forward and backward pass.
            # NOTE: gradients of accumulated batches are summed, not averaged.
            if use_fp16:
                with autocast():
                    outputs = model(input_values=inputs, labels=labels)
                    loss = outputs.loss
                # scaler.step() expects gradients produced from the scaled loss
                scaler.scale(loss).backward()
            else:
                outputs = model(input_values=inputs, labels=labels)
                loss = outputs.loss
                loss.backward()

            # Gradient accumulation: count processed batches rather than the
            # loader index, so a skipped batch cannot swallow an update.
            pending_batches += 1
            if pending_batches >= accumulation_steps:
                apply_optimizer_step()
                pending_batches = 0

            # Accumulate loss and accuracy
            processed_batches += 1
            running_loss += loss.item()
            predicted = outputs.logits.argmax(dim=1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)

        # Apply whatever is left from an incomplete accumulation group
        if pending_batches:
            apply_optimizer_step()

        # Print loss and accuracy for the epoch (0 when nothing was processed)
        epoch_loss = running_loss / processed_batches if processed_batches else 0.0
        epoch_accuracy = correct_predictions / total_samples * 100 if total_samples else 0.0
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

        # Clear CUDA cache
        torch.cuda.empty_cache()

        # Run validation after each epoch
        val_loss = validate_model(val_loader, model, device, criterion)
        print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss:.4f}")

# Validation loop
def validate_model(val_loader, model, device, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the mean batch loss.

    Reads the module-level names ``use_fp16``, ``autocast`` and ``tqdm``. The
    model is switched to evaluation mode and is left in that mode.

    Args:
        val_loader: Iterable of batches holding ``"input_values"`` and
            ``"labels"``. Samples labelled ``-1`` are treated as filler and
            dropped before the forward pass.
        model: Called as ``model(input_values=..., labels=...)``; the result
            must expose ``.loss`` and ``.logits``.
        device: Device the batch tensors are moved to.
        criterion: Accepted for interface compatibility; the loss used is the
            one returned by the model.

    Returns:
        float: Loss averaged over the batches that were evaluated, or ``0.0``
        when no batch contained a usable sample. Accuracy is printed.
    """
    model.eval()  # Set the model to evaluation mode
    running_val_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    evaluated_batches = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            # Skip empty batches
            if len(batch["input_values"]) == 0:
                continue

            inputs = batch["input_values"].to(device)
            labels = batch["labels"].to(device)

            # Drop filler samples: -1 is not a valid class index for the loss,
            # and counting them would distort the accuracy.
            mask = labels != -1
            inputs = inputs[mask]
            labels = labels[mask]
            if inputs.size(0) == 0:
                continue

            # Forward pass
            if use_fp16:
                with autocast():
                    outputs = model(input_values=inputs, labels=labels)
            else:
                outputs = model(input_values=inputs, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            # Compute validation loss
            evaluated_batches += 1
            running_val_loss += loss.item()

            # Calculate accuracy
            predicted = logits.argmax(dim=1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    # Calculate average validation loss and accuracy (0 when nothing was evaluated)
    val_loss = running_val_loss / evaluated_batches if evaluated_batches else 0.0
    val_accuracy = correct_predictions / total_samples * 100 if total_samples else 0.0
    print(f"Validation Accuracy: {val_accuracy:.2f}%")

    return val_loss







Training Epoch 1/15:   0%|          | 0/21 [00:00<?, ?it/s]