In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torchaudio
import librosa
import os
from sklearn.model_selection import train_test_split

# Emotion mapping from dataset description
# Keys are the two-character emotion codes taken from the third dash-separated
# field of each filename; values are the label written to the "emotion" column.
# Only the codes listed here are kept: files with any other emotion code are
# filtered out by the loading loop.
EMOTION_MAP = {
    "04": "sadness",
    "05": "anger",
    "03": "happiness",
    "06": "fear",
    "07": "disgust"
}

# Statement mapping
# Keys are the statement codes (fifth filename field); values are the sentence
# stored in the "text" column. Codes missing from this map yield an empty string.
STATEMENT_MAP = {
    "01": "Kids are talking by the door",
    "02": "Dogs are sitting by the door"
}

# Build one metadata record per usable speech clip.
# Filenames are expected to hold seven dash-separated fields:
# modality-vocal_channel-emotion-intensity-statement-repetition-actor.
data = []
unreadable_paths = []  # files that torchaudio failed to decode

# Path.glob yields entries in a filesystem-dependent order. Sorting makes the
# row order -- and therefore any seeded shuffle/split applied to these rows --
# reproducible across machines and runs.
wav_paths = sorted(Path("RAVDESS_data").glob("**/*.wav"))

for path in tqdm(wav_paths):
    name = path.stem  # Filename without extension
    parts = name.split("-")

    # Exactly seven fields are required: the tuple unpacking below raises
    # ValueError for any other count, so over-long names must be skipped too.
    if len(parts) != 7:
        continue

    modality, vocal_channel, emotion, intensity, statement, repetition, actor = parts

    if vocal_channel != "01":  # Only keep speech (01 = speech, 02 = song)
        continue

    if emotion not in EMOTION_MAP:  # Keep only the selected emotions
        continue

    try:
        # Decode the file once purely as a validity check; the samples are discarded.
        torchaudio.load(path)
    except Exception:
        # Best-effort: a broken file is skipped, but remembered so it can be reported.
        unreadable_paths.append(str(path))
        continue

    data.append({
        "name": name,
        "path": str(path),
        "emotion": EMOTION_MAP[emotion],
        "text": STATEMENT_MAP.get(statement, ""),  # "" when the statement code is unknown
    })

if unreadable_paths:
    print(f"Skipped {len(unreadable_paths)} unreadable file(s): {unreadable_paths}")

# Convert data to DataFrame
df = pd.DataFrame(data)
print(f"Total valid files: {len(df)}")

# An empty frame has no "emotion" column, so the stratified split below would
# fail with an opaque KeyError; stop here with an actionable message instead.
if df.empty:
    raise ValueError(
        "No valid audio files were collected; check that the dataset "
        "directory exists and contains the expected .wav files."
    )

# Shuffle dataset (fixed seed so the row order is repeatable)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train and test: 80/20, stratified so every emotion keeps the same
# proportion in both parts.
# NOTE(review): the split is stratified by emotion only, not grouped by actor,
# so clips from the same speaker can land in both train and test -- confirm
# this is intended before reporting speaker-independent results.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["emotion"])

# Save to CSV (tab-separated, no index column)
save_path = "dataset"
os.makedirs(save_path, exist_ok=True)
train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)

print(f"Train set: {train_df.shape}")
print(f"Test set: {test_df.shape}")


1440it [00:00, 2418.70it/s]

Total valid files: 960
Train set: (768, 4)
Test set: (192, 4)





In [4]:
import pandas as pd 
# Class balance of the saved training split (tab-separated file).
train_emotions = pd.read_csv("dataset/train.csv", sep="\t")["emotion"]
train_emotions.value_counts()

emotion
anger        154
fear         154
sadness      154
happiness    153
disgust      153
Name: count, dtype: int64

In [5]:

# Class balance of the saved test split (tab-separated file).
test_emotions = pd.read_csv("dataset/test.csv", sep="\t")["emotion"]
test_emotions.value_counts()

emotion
disgust      39
happiness    39
fear         38
sadness      38
anger        38
Name: count, dtype: int64