In [23]:
# Download dataset. This is not needed if you have the dataset already

# import kagglehub

# path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")

# print("Path to dataset files:", path)

# Data Cleaning and Reformatting

In [24]:
# Check the sampling rate of a WAV file; an invalid file path is reported as an error

import wave
import contextlib

def check_sampling_rate(file_path):
    """Print and return the sampling rate of a WAV file.

    Args:
        file_path: Path to the WAV file to inspect.

    Returns:
        The sampling rate in Hz, or None when the file could not be opened
        or parsed (the error is printed instead of being raised).
    """
    try:
        # Objects returned by wave.open() are context managers, so the file
        # is closed as soon as the block exits.
        with wave.open(file_path, 'rb') as wav_file:
            sample_rate = wav_file.getframerate()
    except Exception as e:
        # Best-effort check: report the problem and signal failure with None.
        print(f"Error: {e}")
        return None

    print(f"Sampling rate: {sample_rate} Hz")
    return sample_rate

# Run the check on a single sample file from the Actor_01 folder of the dataset.
file_path = 'data/ravdess-emotional-speech-audio/versions/1/Actor_01/03-01-01-01-01-01-01.wav'
check_sampling_rate(file_path)

Sampling rate: 48000 Hz


In [25]:
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T

import librosa
import matplotlib.pyplot as plt

In [26]:
"""
File naming convention

Each of the 1440 files has a unique filename. The filename consists of a 7-part numerical identifier (e.g., 03-01-06-01-02-01-12.wav). These identifiers define the stimulus characteristics:

Filename identifiers

Modality (01 = full-AV, 02 = video-only, 03 = audio-only).

Vocal channel (01 = speech, 02 = song).

Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).

Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.

Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").

Repetition (01 = 1st repetition, 02 = 2nd repetition).

Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).
"""


import os
import pandas as pd

# Map the two-digit emotion code embedded in each filename to a readable label.
emotion_mapping = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

# Root folder that contains one "Actor_XX" sub-folder per speaker.
file_dir = "data/ravdess-emotional-speech-audio/versions/1/"

# One record per .wav file: its emotion label plus a flattened
# "Actor_XX_<original filename>" identifier (stored under "file_path").
data = []

# os.listdir() returns entries in arbitrary order; sort so the row order (and
# therefore the DataFrame index) is reproducible across machines.
for actor in sorted(os.listdir(file_dir)):
    actor_path = os.path.join(file_dir, actor)

    if not (os.path.isdir(actor_path) and actor.startswith("Actor_")):
        continue
    actor_number = actor.split("_")[-1]

    for file in sorted(os.listdir(actor_path)):
        if not file.endswith(".wav"):
            continue
        # The emotion code is the third dash-separated field (characters 6-7).
        emotion_code = file[6:8]
        emotion = emotion_mapping.get(emotion_code, "unknown")
        formatted_filename = f"Actor_{actor_number}_{file}"
        data.append({"emotion": emotion, "file_path": formatted_filename})

# Pin the columns so an empty dataset still yields a frame with the expected schema.
df = pd.DataFrame(data, columns=["emotion", "file_path"])

print(df)

        emotion                          file_path
0       neutral  Actor_01_03-01-01-01-01-01-01.wav
1       neutral  Actor_01_03-01-01-01-01-02-01.wav
2       neutral  Actor_01_03-01-01-01-02-01-01.wav
3       neutral  Actor_01_03-01-01-01-02-02-01.wav
4          calm  Actor_01_03-01-02-01-01-01-01.wav
...         ...                                ...
1435  surprised  Actor_24_03-01-08-01-02-02-24.wav
1436  surprised  Actor_24_03-01-08-02-01-01-24.wav
1437  surprised  Actor_24_03-01-08-02-01-02-24.wav
1438  surprised  Actor_24_03-01-08-02-02-01-24.wav
1439  surprised  Actor_24_03-01-08-02-02-02-24.wav

[1440 rows x 2 columns]


In [27]:
# Remove the 'calm' emotion. Take an explicit copy so that columns added to
# `df` afterwards are written to an independent frame rather than to a slice
# of the unfiltered one (which triggers pandas' SettingWithCopyWarning).
df = df[df.emotion != 'calm'].copy()

# Extracting waveforms and spectrograms

In [28]:
from IPython.display import Audio
from matplotlib.patches import Rectangle
from torchaudio.utils import download_asset
import torch
import matplotlib.pyplot as plt

# Fix torch's global RNG seed so any random behavior is repeatable between runs.
torch.manual_seed(42)

def plot_waveform(waveform, sr, title=None, ax=None):
    """Plot the first channel of an audio waveform against time in seconds.

    Args:
        waveform: Tensor of shape (num_channels, num_frames); only channel 0
            is drawn.
        sr: Sampling rate in Hz, used to convert frame indices to seconds.
        title: Optional title for the axes.
        ax: Matplotlib axes to draw on. When omitted, a new figure with a
            single axes is created.
    """
    samples = waveform.numpy()

    _, num_frames = samples.shape
    time_axis = torch.arange(0, num_frames) / sr

    if ax is None:
        # Only channel 0 is plotted, so always request a single Axes. Asking
        # for one row per channel returns an array of Axes for multi-channel
        # audio, and an array has no .plot().
        _, ax = plt.subplots(1, 1)
    ax.plot(time_axis, samples[0], linewidth=1)
    ax.set_xlim([0, time_axis[-1]])
    ax.set_title(title)

def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None):
    """Draw a power spectrogram on a decibel scale.

    Args:
        specgram: Power spectrogram to display (the caller in this notebook
            passes a single channel of a torchaudio Spectrogram output).
        title: Optional title for the axes; left unset when None.
        ylabel: Label for the y-axis; left unset when None.
        ax: Matplotlib axes to draw on. When omitted, a new figure with a
            single axes is created.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)
    if title is not None:
        ax.set_title(title)
    # `ylabel` used to be accepted but never applied to the axes.
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    ax.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto", interpolation="nearest")

In [29]:
def pull_wave_and_spec(
    base_dir="data/ravdess-emotional-speech-audio/versions/1",
    output_dir="speech",
    skip_actors_up_to=22,
):
    """Render a waveform PNG and a spectrogram PNG for every .wav file.

    For each "Actor_XX" folder under `base_dir`, every .wav file is loaded and
    two images are written to `output_dir`:
    "<actor>_<file>_waveform.png" and "<actor>_<file>_spectrogram.png".

    Args:
        base_dir: Folder containing the "Actor_XX" sub-folders.
        output_dir: Folder the PNG files are written to; created if missing.
        skip_actors_up_to: Actors whose number is <= this value are skipped.
            This lets an interrupted run be resumed from a later actor without
            editing the function body. The default (22) keeps the previously
            hard-coded behavior; pass 0 to process every actor.
    """
    # Make sure the destination exists before the first figure is saved.
    os.makedirs(output_dir, exist_ok=True)

    # The transform is configured identically for every file, so build it
    # once outside the loops.
    spectrogram = T.Spectrogram(n_fft=512)

    # Sorted so the processing order (and the progress log) is predictable.
    for actor in sorted(os.listdir(base_dir)):
        actor_path = os.path.join(base_dir, actor)
        if not (os.path.isdir(actor_path) and actor.startswith("Actor_")):
            continue

        actor_num = int(actor.split("_")[1])
        if actor_num <= skip_actors_up_to:
            continue

        print(f"Processing {actor}...")
        for file in sorted(os.listdir(actor_path)):
            if not file.endswith(".wav"):
                continue

            speech_path = os.path.join(actor_path, file)
            speech_waveform, sample_rate = torchaudio.load(speech_path)

            # Perform transform
            spec = spectrogram(speech_waveform)

            # Waveform image
            fig, ax = plt.subplots()
            plot_waveform(speech_waveform, sample_rate, title=None, ax=ax)
            waveform_path = os.path.join(output_dir, f"{actor}_{file}_waveform.png")
            fig.savefig(waveform_path)
            # Close each figure explicitly; one is created per file.
            plt.close(fig)

            # Spectrogram image (first channel only)
            fig, ax = plt.subplots()
            plot_spectrogram(spec[0], title=None, ax=ax)
            spectrogram_path = os.path.join(output_dir, f"{actor}_{file}_spectrogram.png")
            fig.savefig(spectrogram_path)
            plt.close(fig)

# Connecting speech waveforms and spectrograms to the pandas DataFrame

In [30]:
import torchvision.transforms as transforms
from PIL import Image

# Folder the spectrogram PNG images are read from.
image_dir = "speech"

# Image -> tensor pipeline: convert the image to a tensor, then normalize it
# with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)

# Function to load spectrogram as tensor
def load_spectrogram_tensor(file_path):
    """Return the spectrogram image belonging to `file_path` as a tensor.

    The image is looked up as "<basename of file_path>_spectrogram.png" inside
    the module-level `image_dir`, converted to grayscale and passed through
    the module-level `transform`. None is returned when no such image exists.
    """
    base_name = os.path.basename(file_path)
    png_path = os.path.join(image_dir, f"{base_name}_spectrogram.png")

    # Nothing rendered for this recording: signal that with None.
    if not os.path.exists(png_path):
        return None

    grayscale = Image.open(png_path).convert("L")
    return transform(grayscale)

# Attach the spectrogram tensors as a new column. assign() builds a new frame
# instead of writing into `df` in place, which avoids SettingWithCopyWarning
# when `df` is a filtered slice of an earlier frame.
df = df.assign(spectrogram_tensor=df["file_path"].apply(load_spectrogram_tensor))

# Display updated DataFrame
print(df.head())


    emotion                          file_path  \
0   neutral  Actor_01_03-01-01-01-01-01-01.wav   
1   neutral  Actor_01_03-01-01-01-01-02-01.wav   
2   neutral  Actor_01_03-01-01-01-02-01-01.wav   
3   neutral  Actor_01_03-01-01-01-02-02-01.wav   
12    happy  Actor_01_03-01-03-01-01-01-01.wav   

                                   spectrogram_tensor  
0   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...  
1   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...  
2   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...  
3   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...  
12  [[[tensor(1.), tensor(1.), tensor(1.), tensor(...  


In [31]:
import torchvision.transforms as transforms
from PIL import Image
import os
import pandas as pd

# Folder the waveform PNG images are read from. This and the pipeline below
# repeat the values assigned in the spectrogram-loading cell.
image_dir = "speech"

# Image -> tensor pipeline: tensor conversion followed by normalization
# with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)

# Function to load waveform as tensor
def load_waveform_tensor(file_path):
    """Return the waveform image belonging to `file_path` as a tensor.

    The image is looked up as "<basename of file_path>_waveform.png" inside
    the module-level `image_dir`, converted to grayscale and passed through
    the module-level `transform`. None is returned when no such image exists.
    """
    # e.g. "Actor_01_03-01-01-01-01-01-01.wav"
    base_name = os.path.basename(file_path)
    candidate = os.path.join(image_dir, f"{base_name}_waveform.png")

    # No rendered image for this recording.
    if not os.path.exists(candidate):
        return None

    return transform(Image.open(candidate).convert("L"))

# Attach the waveform tensors as a new column. assign() builds a new frame
# instead of writing into `df` in place, which avoids SettingWithCopyWarning
# when `df` is a filtered slice of an earlier frame.
df = df.assign(waveform_tensor=df["file_path"].apply(load_waveform_tensor))

# Display updated DataFrame
print(df.head())

    emotion                          file_path  \
0   neutral  Actor_01_03-01-01-01-01-01-01.wav   
1   neutral  Actor_01_03-01-01-01-01-02-01.wav   
2   neutral  Actor_01_03-01-01-01-02-01-01.wav   
3   neutral  Actor_01_03-01-01-01-02-02-01.wav   
12    happy  Actor_01_03-01-03-01-01-01-01.wav   

                                   spectrogram_tensor  \
0   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...   
1   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...   
2   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...   
3   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...   
12  [[[tensor(1.), tensor(1.), tensor(1.), tensor(...   

                                      waveform_tensor  
0   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...  
1   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...  
2   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...  
3   [[[tensor(1.), tensor(1.), tensor(1.), tensor(...  
12  [[[tensor(1.), tensor(1.), tensor(1.), tensor(...  


# Creating the model