# Data Loading 
Using a single zip "Actors 01-05" as a test for loading and creating the full metadata dataset on the larger corpus of audio .wav files, for simplicity and to avoid using excessive memory.

Directories have the following structure:

- Actors_1-5 (top level, many of these with name structure Actors_6-10, Actors_11_15, etc.)
    - Actor_01 (second level, many of these with name structure Actor_01, Actor_02, etc.)
       - 03-01-01-01-01-01-01.wav (third level, each file is a wav audio file, and the names contain the feature to be extracted)
          > Many of these audio files follow the same naming structure but are not sequential.


In [9]:
import zipfile
import os
import librosa
import pandas as pd
import shutil

# Mood Miners Emotion Detection Project
# Description: This file contains the code to load the data from the zip file

def process_audio_from_zip(zip_path):
    """
    This function takes in the path to a zip file containing audio files.
    It then processes the audio data using librosa and creates and returns a dataframe.
    """
    # set extract path to name of zip
    extract_path = zip_path.split('.')[0]
    # create the directory if it doesn't exist
    if not os.path.exists(extract_path):
        print('Extracting zip file ' + zip_path.split('/')[1] + ' to ' + zip_path.split('.')[0])
        os.makedirs(extract_path)

        # extract the contents of the zip file to the directory
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    # first, simple audio features used by librosa, excluding the raw data
    # features: tempo, spectral_centroid, spectral_bandwidth, spectral_rolloff, zero_crossing_rate
    #           chroma_stft, mfcc, rmse

    # second, inherent emotion features
    # creating the feature dictionary to return later as a df
    feature_dict = {'actor': [], 'tempo': [], 'y':[], 'sr':[], 'onset_env':[], 'spectral_centroid': [], 'spectral_bandwidth':[], 'spectral_rolloff':[], 'zero_crossing_rate':[], 'chroma_stft':[],
                    'mfcc':[], 'rmse':[], 'modality':[], 'vocal_channel':[], 'emotion':[], 'emotional_intensity':[], 'statement':[],'repetition':[]
    }
    # for each file in the directory ill be inserting the data into the feature dictionary
    for actor_dir in os.listdir(extract_path):
        if not actor_dir.startswith('Actor'):
            continue
        print('Processing the actor directory: ' + actor_dir)
        for wav_file in os.listdir(extract_path + '/' + actor_dir):
            if not wav_file.endswith('.wav'):
                continue
            ### Process Librosa Features ###
            # load the audio file
            y, sr = librosa.load(extract_path + '/' + actor_dir + '/' + wav_file)
            # calculate the tempo
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            # calculate the spectral centroid
            spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
            # calculate the spectral bandwidth
            spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
            # calculate the spectral rolloff
            spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            # calculate the zero crossing rate
            zcr = librosa.feature.zero_crossing_rate(y)
            # calculate the chroma stft
            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            # calculate the mfcc
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
            # calculate the rmse
            rmse = librosa.feature.rms(y=y)
            # calculate onset strength
            onset_env = librosa.onset.onset_strength(y=y, sr=sr)
            # add all the features to the dictionary
            # appending the features to the dictionary list with the following keys [tempo, spec_cent, spec_bw, spec_rolloff, zcr, chroma_stft, mfcc, rmse]
            feature_dict['actor'].append(actor_dir)
            feature_dict['tempo'].append(tempo)
            feature_dict['spectral_centroid'].append(spec_cent)
            feature_dict['spectral_bandwidth'].append(spec_bw)
            feature_dict['spectral_rolloff'].append(spec_rolloff)
            feature_dict['zero_crossing_rate'].append(zcr)
            feature_dict['chroma_stft'].append(chroma_stft)
            feature_dict['mfcc'].append(mfcc)
            feature_dict['rmse'].append(rmse)
            feature_dict['onset_env'].append(onset_env)
            feature_dict['y'].append(y)
            feature_dict['sr'].append(sr)

            ### Process Inherent Emotion Features ###
            identifiers_only = wav_file.split('.')[0].split('-')
            # Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
            feature_dict['modality'].append(identifiers_only[0])
            # Vocal channel (01 = speech, 02 = song).
            feature_dict['vocal_channel'].append(identifiers_only[1])
            # Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
            feature_dict['emotion'].append(identifiers_only[2])
            # Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
            feature_dict['emotional_intensity'].append(identifiers_only[3])
            # Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
            feature_dict['statement'].append(identifiers_only[4])
            # Repetition (01 = 1st repetition, 02 = 2nd repetition).
            feature_dict['repetition'].append(identifiers_only[5])

        print("Finished processing the actor directory: " + actor_dir)
    print("Finished processing all the audio files in the zip file " + zip_path.split('/')[1])
    
    # deleting the root directory after processing
    print("Deleting the root directory " + extract_path)
    shutil.rmtree(extract_path)
    
    actor_audio_df = pd.DataFrame(feature_dict)
    return actor_audio_df


In [10]:
audio_metadata_shard = process_audio_from_zip('emotiona_speech/Actors_1-5.zip')

Processing the actor directory: Actor_04
Finished processing the actor directory: Actor_04
Processing the actor directory: Actor_03
Finished processing the actor directory: Actor_03
Processing the actor directory: Actor_02
Finished processing the actor directory: Actor_02
Processing the actor directory: Actor_05
Finished processing the actor directory: Actor_05
Processing the actor directory: Actor_01
Finished processing the actor directory: Actor_01
Finished processing all the audio files in the zip file Actors_1-5.zip
Deleting the root directory emotiona_speech/Actors_1-5


In [11]:
audio_metadata_shard#.to_csv('Actors_1-5_Metadata.csv', index=False)

Unnamed: 0,actor,tempo,y,sr,onset_env,spectral_centroid,spectral_bandwidth,spectral_rolloff,zero_crossing_rate,chroma_stft,mfcc,rmse,modality,vocal_channel,emotion,emotional_intensity,statement,repetition
0,Actor_04,86.132812,"[9.556889e-06, 7.979388e-06, -2.6755222e-06, 1...",22050,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.023...","[[4885.3915765124675, 4300.626264748916, 4269....","[[3238.521581755041, 3364.480429795827, 3372.8...","[[8731.7138671875, 8473.3154296875, 8774.78027...","[[0.40869140625, 0.54736328125, 0.7138671875, ...","[[0.8057866, 0.32469767, 0.26406005, 0.4697149...","[[-668.3413, -668.3413, -668.3413, -668.3413, ...","[[4.0514665e-06, 7.336948e-06, 9.567447e-06, 1...",03,01,03,02,02,02
1,Actor_04,151.999081,"[-2.3802373e-08, 1.7626292e-09, 1.6234665e-08,...",22050,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[5034.5240982262385, 4958.2984163909605, 4701...","[[3120.1448163244186, 3155.079455252928, 3021....","[[8925.5126953125, 8817.8466796875, 8516.38183...","[[0.447265625, 0.6708984375, 0.8984375, 0.9038...","[[0.7569911, 0.80539256, 0.69772, 0.6061046, 0...","[[-783.933, -783.933, -783.933, -783.933, -783...","[[2.470209e-06, 3.1261097e-06, 3.5455403e-06, ...",03,01,03,01,01,02
2,Actor_04,103.359375,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",22050,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-856.4625, -856.4625, -856.4625, -856.4625, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",03,01,02,02,01,01
3,Actor_04,112.347147,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",22050,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-902.7997, -902.7997, -902.7997, -902.7997, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",03,01,01,01,02,02
4,Actor_04,52.734375,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",22050,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-850.1834, -850.1834, -850.1834, -850.1834, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",03,01,02,01,02,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,Actor_01,161.499023,"[-2.655571e-06, -1.2237584e-05, -9.1077754e-07...",22050,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2608155, 0.12...","[[4943.788470024794, 4993.147051154355, 5019.4...","[[3303.7808534145265, 3274.449711736935, 3177....","[[9043.9453125, 9022.412109375, 8893.212890625...","[[0.21435546875, 0.35546875, 0.517578125, 0.48...","[[0.7277329, 0.6844372, 0.8753823, 0.8718942, ...","[[-864.93823, -864.93823, -864.93823, -864.938...","[[1.5071628e-06, 1.9333706e-06, 2.1324133e-06,...",03,01,01,01,01,02
296,Actor_01,117.453835,"[1.277529e-05, 5.257948e-07, -5.977686e-07, 5....",22050,"[0.0, 0.0, 0.0, 0.6661148, 0.28193384, 0.0, 0....","[[4858.008571688488, 4406.104244501179, 4227.8...","[[3362.090115162583, 3477.10537796233, 3506.02...","[[8990.1123046875, 8839.3798828125, 8828.61328...","[[0.34912109375, 0.53271484375, 0.724609375, 0...","[[0.87311006, 0.8823115, 0.78982776, 0.8708325...","[[-904.572, -897.0358, -894.00964, -900.38275,...","[[7.823081e-06, 9.391375e-06, 9.465683e-06, 9....",03,01,02,01,01,01
297,Actor_01,135.999178,"[2.4291887e-05, 2.9232147e-05, 8.04831e-06, 2....",22050,"[0.0, 0.0, 0.0, 1.395079, 0.66954195, 0.762513...","[[3527.26574322685, 3731.6598384436093, 3848.3...","[[3424.592313921696, 3408.2188353741108, 3370....","[[8193.3837890625, 8247.216796875, 8279.516601...","[[0.20361328125, 0.3349609375, 0.51416015625, ...","[[0.8004069, 0.8320442, 0.71533036, 0.5944415,...","[[-902.4752, -888.39215, -894.73846, -892.8008...","[[1.8049568e-05, 2.1408556e-05, 2.2228167e-05,...",03,01,02,02,02,01
298,Actor_01,172.265625,"[8.447154e-06, 9.375061e-06, 2.0482428e-05, 1....",22050,"[0.0, 0.0, 0.0, 0.13223195, 0.1392346, 0.22554...","[[4405.080888462469, 4456.270213669002, 4339.4...","[[3371.2924362001995, 3363.3228052507625, 3414...","[[8720.947265625, 8731.7138671875, 8688.647460...","[[0.3359375, 0.46484375, 0.638671875, 0.694335...","[[0.7623131, 0.83263075, 0.9964961, 0.8838115,...","[[-849.5774, -848.08136, -846.53864, -843.9868...","[[6.9308962e-06, 1.241332e-05, 1.3071153e-05, ...",03,01,03,01,02,02


In [54]:
audio_metadata_shard.shape

(300, 18)

In [53]:
print("After processing of the audio data into a metadata CSV dataset for the Actors 1-5, the size of the raw audio dataset/directory reduced from 125 MB to only 5 MB, which is a reduction of about {}%".format(1 - 5/125))

After processing of the audio dat into a metadata CSV dataset for the Actors 1-5, the size of the dataset reduced from 125 MB to only 5 MB, which is a reduction of about 0.96%


# Conclusion
Given that the function was able to process the data for the Actors 1-5 zip successfully, we can now go on to write a complete script to process the entire raw data directory corpus (for Actors 1-24, excluding the additional emotiona_speech/audio_speech_actors_01-24 directory - this one may contain different features)

In [12]:
#run this after running python sample.py
meta_df = pd.read_csv('actors_meta_df.csv')

In [14]:
meta_df.shape

(1440, 18)