# Metadata preprocessing

Run this script to preprocess the datasets and create their metadata files, once you have downloaded the datasets into the `data/EmoData` folder.

## EmoDB

In [None]:
import os
import pandas as pd

In [None]:
# Build the EmoDB metadata CSV from the .wav file names found in wav_dir.
data_dir = "data"
emo_data_dir = os.path.join(data_dir, "EmoData")
EmoDB_dir = os.path.join(emo_data_dir, "EmoDB")
wav_dir = os.path.join(EmoDB_dir, "wav")
metadata_filepath = os.path.join(data_dir, "metadata_emo_EmoDB.csv")

# Emotion letter found at position 5 of a file name -> emotion label.
EmoDict_german = {
    'W': 'anger',
    'L': 'boredom',
    'A': 'fear',
    'F': 'joy',
    'T': 'sadness',
    'E': 'disgust',
    'N': 'neutral',
}

metadata_columns = ['filepath', 'speaker', 'gender', 'emotion', 'utterance_id']

# Collect one dict per recording and build the DataFrame once at the end;
# concatenating a one-row frame per file copies the whole table every time.
rows = []
# Sorted so that the row order of the CSV does not depend on the file system.
for file in sorted(os.listdir(wav_dir)):
    # Skip anything that is not a recording (hidden files, sub-folders, ...);
    # such names would otherwise fail on the slicing / emotion lookup below.
    if not file.lower().endswith('.wav'):
        continue

    # File names are sliced as: [0:2] speaker, [2:5] utterance id, [5] emotion letter.
    speaker = file[:2]
    utterance_id = file[2:5]
    emotion_letter = file[5]
    emotion = EmoDict_german[emotion_letter]  # KeyError on an unknown letter
    # Stored relative to the EmoData folder.
    filepath = os.path.join('EmoDB', 'wav', file)

    rows.append({'filepath': filepath,
                 'speaker': speaker,
                 'gender': None,  # not derived from the file name; left empty
                 'emotion': emotion,
                 'utterance_id': utterance_id})

# The metadata file is always regenerated from scratch and overwritten.
metadata_new = pd.DataFrame(rows, columns=metadata_columns)
metadata_new.to_csv(metadata_filepath, index=False)
print('Done')

## eNTERFACE

The eNTERFACE dataset is composed of video recordings of actors performing emotional speech. As we are only interested in the audio, we extract the audio track from each video file. The extracted audio files are stored in the `data/EmoData/eNTERFACE/wav` folder.

In [None]:
import numpy as np
from moviepy.editor import VideoFileClip
from scipy.io import wavfile

In [None]:
def extract_audio_array_from_video(avi_path):
    """
    Extract the audio track of a single .avi video as a mono array.

    The samples are read with ``audio.iter_frames()`` at the audio clip's own
    rate and the channels are averaged.
    NOTE(review): the caller writes the result at 44100 Hz, presumably
    moviepy's default audio rate -- confirm.

    Parameters
    ----------
    avi_path : path of the video file (.avi)

    Returns
    -------
    mono : 1-D array of shape (nb samples,), mean over the audio channels

    Raises
    ------
    ValueError : if the video has no audio track
    """

    video = VideoFileClip(avi_path)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError(f"No audio track found in {avi_path}")
        # Materialise every audio frame, then stack them into an
        # (nb samples, nb channels) array.
        audio_samples = list(audio.iter_frames())
        sound_array = np.array(audio_samples)
    finally:
        # Release the readers opened by moviepy; without this, one reader per
        # video stays open while a whole dataset is processed.
        video.close()

    mono = np.mean(sound_array, axis=1)  # average the channels -> mono

    return mono

In [None]:
# Extract the audio track of every eNTERFACE video into a mirrored .wav tree.
data_dir = "data"
emo_data_dir = os.path.join(data_dir, "EmoData")
enterface_dir = os.path.join(emo_data_dir, "eNTERFACE", "enterface database")
enterface_wav_dir = os.path.join(emo_data_dir, "eNTERFACE", "wav")

# Walk the <subject>/<emotion>/<sentence>/<video>.avi layout and write
# <subject>/<emotion>/<sentence>/<video>.wav under enterface_wav_dir.
for subject in os.listdir(enterface_dir):
    subject_path = os.path.join(enterface_dir, subject)
    if not os.path.isdir(subject_path):
        # Stray file next to the subject folders; listing it would fail.
        continue
    for emotion in os.listdir(subject_path):
        emotion_path = os.path.join(subject_path, emotion)
        if not os.path.isdir(emotion_path):
            continue
        for sentence in os.listdir(emotion_path):
            sentence_path = os.path.join(emotion_path, sentence)
            if not os.path.isdir(sentence_path):
                # NOTE: files stored directly in an emotion folder (without a
                # sentence sub-folder) are skipped, videos included.
                continue
            for video in os.listdir(sentence_path):
                if not video.endswith('.avi'):
                    continue
                video_path = os.path.join(sentence_path, video)
                audio_filename = os.path.splitext(video)[0] + '.wav'
                audio_path = os.path.join(enterface_wav_dir, subject, emotion, sentence)
                audio_filepath = os.path.join(audio_path, audio_filename)

                # Existing outputs are kept, so the cell can be re-run after
                # an interruption without redoing the work.
                if os.path.exists(audio_filepath):
                    print(f"Audio file {audio_filepath} already exists. Skipping extraction.")
                    continue

                audio_array = extract_audio_array_from_video(video_path)
                os.makedirs(audio_path, exist_ok=True)
                print(f"Extracting audio from {video} to {audio_filename}")
                # NOTE(review): 44100 Hz is hard-coded and must match the rate
                # of the extracted samples -- confirm against the extractor.
                wavfile.write(audio_filepath, 44100, audio_array)

Create the metadata file.

In [None]:
# Build the eNTERFACE metadata CSV from the extracted .wav tree.
data_dir = "data"
emo_data_dir = os.path.join(data_dir, "EmoData")
enterface_wav_dir = os.path.join(emo_data_dir, "eNTERFACE", "wav")
metadata_filepath = os.path.join(data_dir, "metadata_emo_eNTERFACE.csv")

# Emotion folder name -> emotion label written to the metadata.
emotion_dict = {
    'anger': 'anger',
    'happiness': 'joy',
    'fear': 'fear',
    'sadness': 'sadness',
    'disgust': 'disgust',
    'surprise': 'surprise',
    'neutral': 'neutral',
}

metadata_columns = ['filepath', 'speaker', 'gender', 'emotion', 'utterance_id']

# The metadata file is always rebuilt from the wav tree. It is deliberately not
# read first: it does not exist on the first run, and its previous content
# would be replaced anyway.
rows = []
# Sorted listings keep the row order of the CSV independent of the file system.
for subject in sorted(os.listdir(enterface_wav_dir)):
    subject_path = os.path.join(enterface_wav_dir, subject)
    for emotion in sorted(os.listdir(subject_path)):
        emotion_path = os.path.join(subject_path, emotion)
        for sentence in sorted(os.listdir(emotion_path)):
            sentence_path = os.path.join(emotion_path, sentence)
            if not os.path.isdir(sentence_path):
                continue
            for audio in sorted(os.listdir(sentence_path)):
                if not audio.endswith('.wav'):
                    continue
                audio_path = os.path.join(sentence_path, audio)
                emotion_label = emotion_dict[emotion]  # KeyError on an unknown folder
                # Drop the first 7 / 8 characters of the folder names --
                # presumably the "subject" / "sentence" prefixes; verify
                # against the dataset layout.
                speaker = subject[7:]
                utterance_id = sentence[8:]
                # Path relative to the EmoData folder, computed from
                # emo_data_dir instead of a hard-coded character offset.
                filepath = os.path.relpath(audio_path, emo_data_dir)
                rows.append({'filepath': filepath,
                             'speaker': speaker,
                             'gender': None,  # not available from the folder names
                             'emotion': emotion_label,
                             'utterance_id': utterance_id})

# Build the table once instead of concatenating a one-row frame per file.
metadata_new = pd.DataFrame(rows, columns=metadata_columns)
metadata_new.to_csv(metadata_filepath, index=False)
print('Done')


# Create split files

Run this script to create the split files (train, val, test) for training and testing the models. The split files are created under the LOSGO (Leave One Speaker Group Out) or LOSO (Leave One Speaker Out) procedure.

## EmoDB

In [None]:
# Split the EmoDB metadata into train / validation / test files by speaker.
emodb_metadata_filepath = os.path.join('data', 'metadata_emo_EmoDB.csv')
emodb_metadata = pd.read_csv(emodb_metadata_filepath)

# Select the speakers for test and validation sets
test_speakers = [16]
validation_speakers = [15]

# Speaker ids as integers, whatever type read_csv inferred for the column.
speaker_ids = emodb_metadata['speaker'].map(int)

# Boolean masks select each subset in one pass. The test set takes precedence
# over the validation set; every remaining speaker goes to training.
is_test = speaker_ids.isin(test_speakers)
is_validation = speaker_ids.isin(validation_speakers) & ~is_test

test_metadata = emodb_metadata[is_test]
validation_metadata = emodb_metadata[is_validation]
train_metadata = emodb_metadata[~(is_test | is_validation)]

# shuffle the metadata files (unseeded: the row order differs between runs)
train_metadata = train_metadata.sample(frac=1).reset_index(drop=True)
validation_metadata = validation_metadata.sample(frac=1).reset_index(drop=True)
test_metadata = test_metadata.sample(frac=1).reset_index(drop=True)

# save the metadata files
train_metadata.to_csv(os.path.join('data', 'train_metadata_emo_emodb.csv'), index=False)
validation_metadata.to_csv(os.path.join('data', 'val_metadata_emo_emodb.csv'), index=False)
test_metadata.to_csv(os.path.join('data', 'test_metadata_emo_emodb.csv'), index=False)
print('Metadata files created successfully')


## eNTERFACE

In [None]:
# Split the eNTERFACE metadata into train / validation / test files by speaker.
enterface_metadata_filepath = os.path.join('data', 'metadata_emo_eNTERFACE.csv')
enterface_metadata = pd.read_csv(enterface_metadata_filepath)

# Speaker groups held out for the test and validation sets
test_speakers = [36, 37, 38, 39]
validation_speakers = [40, 41, 42, 43, 44]

# Speaker ids as integers, whatever type read_csv inferred for the column.
speaker_ids = enterface_metadata['speaker'].map(int)

# Boolean masks select each subset in one pass. The test set takes precedence
# over the validation set; every remaining speaker goes to training.
is_test = speaker_ids.isin(test_speakers)
is_validation = speaker_ids.isin(validation_speakers) & ~is_test

test_metadata = enterface_metadata[is_test]
validation_metadata = enterface_metadata[is_validation]
train_metadata = enterface_metadata[~(is_test | is_validation)]

# shuffle the metadata files (unseeded: the row order differs between runs)
train_metadata = train_metadata.sample(frac=1).reset_index(drop=True)
validation_metadata = validation_metadata.sample(frac=1).reset_index(drop=True)
test_metadata = test_metadata.sample(frac=1).reset_index(drop=True)

# save the metadata files
train_metadata.to_csv(os.path.join('data', 'train_metadata_emo_enterface.csv'), index=False)
validation_metadata.to_csv(os.path.join('data', 'val_metadata_emo_enterface.csv'), index=False)
test_metadata.to_csv(os.path.join('data', 'test_metadata_emo_enterface.csv'), index=False)
print('Metadata files created successfully')
