# Data Loading 
Using a single zip "Actors 01-05" as a test for loading and creating the full metadata dataset on the larger corpus of audio .wav files, for simplicity and to avoid using excessive memory.

Directories have the following structure:

- Actors_1-5 (top level, many of these with name structure Actors_6-10, Actors_11_15, etc.)
    - Actor_01 (second level, many of these with name structure Actor_01, Actor_02, etc.)
       - 03-01-01-01-01-01-01.wav (third level, each file is a wav audio file, and the names contain the feature to be extracted)
          > Many of these audio files follow the same naming structure but are not sequential.


In [3]:
import zipfile
import os
import librosa
import pandas as pd
import shutil
import json
# Mood Miners Emotion Detection Project
# Description: This file contains the code to load the data from the zip file

def process_audio_from_zip(zip_path):
    """
    This function takes in the path to a zip file containing audio files.
    It then processes the audio data using librosa and creates and returns a dataframe.
    """
    # set extract path to name of zip
    extract_path = zip_path.split('.')[0]
    # create the directory if it doesn't exist
    if not os.path.exists(extract_path):
        print('Extracting zip file ' + zip_path.split('/')[1] + ' to ' + zip_path.split('.')[0])
        os.makedirs(extract_path)

        # extract the contents of the zip file to the directory
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    # first, simple audio features used by librosa, excluding the raw data
    # features: tempo, spectral_centroid, spectral_bandwidth, spectral_rolloff, zero_crossing_rate
    #           chroma_stft, mfcc, rmse

    # second, inherent emotion features
    # creating the feature dictionary to return later as a df
    feature_dict = {'actor': [], 'tempo': [], 'y':[], 'sr':[], 'onset_env':[], 'spectral_centroid': [], 'spectral_bandwidth':[], 'spectral_rolloff':[], 'zero_crossing_rate':[], 'chroma_stft':[],
                    'mfcc':[], 'rmse':[], 'modality':[], 'vocal_channel':[], 'emotion':[], 'emotional_intensity':[], 'statement':[],'repetition':[]
    }
    # for each file in the directory ill be inserting the data into the feature dictionary
    for actor_dir in os.listdir(extract_path):
        if not actor_dir.startswith('Actor'):
            continue
        print('Processing the actor directory: ' + actor_dir)
        for wav_file in os.listdir(extract_path + '/' + actor_dir):
            if not wav_file.endswith('.wav'):
                continue
            ### Process Librosa Features ###
            # load the audio file
            y, sr = librosa.load(extract_path + '/' + actor_dir + '/' + wav_file)
            # calculate the tempo
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            # calculate the spectral centroid
            spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
            # calculate the spectral bandwidth
            spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
            # calculate the spectral rolloff
            spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            # calculate the zero crossing rate
            zcr = librosa.feature.zero_crossing_rate(y)
            # calculate the chroma stft
            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            # calculate the mfcc
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
            # calculate the rmse
            rmse = librosa.feature.rms(y=y)
            # calculate onset strength
            onset_env = librosa.onset.onset_strength(y=y, sr=sr)
            # add all the features to the dictionary
            # appending the features to the dictionary list with the following keys [tempo, spec_cent, spec_bw, spec_rolloff, zcr, chroma_stft, mfcc, rmse]
            feature_dict['actor'].append(actor_dir)
            feature_dict['tempo'].append(tempo)
            feature_dict['spectral_centroid'].append(spec_cent)
            feature_dict['spectral_bandwidth'].append(spec_bw)
            feature_dict['spectral_rolloff'].append(spec_rolloff)
            feature_dict['zero_crossing_rate'].append(zcr)
            feature_dict['chroma_stft'].append(chroma_stft)
            feature_dict['mfcc'].append(mfcc)
            feature_dict['rmse'].append(rmse)
            feature_dict['onset_env'].append(onset_env)
            feature_dict['y'].append(y)
            feature_dict['sr'].append(sr)

            ### Process Inherent Emotion Features ###
            identifiers_only = wav_file.split('.')[0].split('-')
            # Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
            feature_dict['modality'].append(identifiers_only[0])
            # Vocal channel (01 = speech, 02 = song).
            feature_dict['vocal_channel'].append(identifiers_only[1])
            # Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
            feature_dict['emotion'].append(identifiers_only[2])
            # Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
            feature_dict['emotional_intensity'].append(identifiers_only[3])
            # Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
            feature_dict['statement'].append(identifiers_only[4])
            # Repetition (01 = 1st repetition, 02 = 2nd repetition).
            feature_dict['repetition'].append(identifiers_only[5])

        print("Finished processing the actor directory: " + actor_dir)
    print("Finished processing all the audio files in the zip file " + zip_path.split('/')[1])
    
    # deleting the root directory after processing
    print("Deleting the root directory " + extract_path)
    shutil.rmtree(extract_path)
    
    actor_audio_df = pd.DataFrame(feature_dict)
    return actor_audio_df


In [4]:
audio_metadata_shard = process_audio_from_zip('emotiona_speech/Actors_1-5.zip')

Extracting zip file Actors_1-5.zip to emotiona_speech/Actors_1-5
Processing the actor directory: Actor_04
Finished processing the actor directory: Actor_04
Processing the actor directory: Actor_03
Finished processing the actor directory: Actor_03
Processing the actor directory: Actor_02
Finished processing the actor directory: Actor_02
Processing the actor directory: Actor_05
Finished processing the actor directory: Actor_05
Processing the actor directory: Actor_01
Finished processing the actor directory: Actor_01
Finished processing all the audio files in the zip file Actors_1-5.zip
Deleting the root directory emotiona_speech/Actors_1-5


In [5]:
#convert the list of list features to json and then string so that it can be stored in a csv as a string with full fidelity
audio_metadata_shard['spectral_centroid'] = audio_metadata_shard['spectral_centroid'].apply(lambda x: json.dumps(x.tolist()))
audio_metadata_shard['spectral_bandwidth'] = audio_metadata_shard['spectral_bandwidth'].apply(lambda x: json.dumps(x.tolist()))
audio_metadata_shard['spectral_rolloff'] = audio_metadata_shard['spectral_rolloff'].apply(lambda x: json.dumps(x.tolist()))
audio_metadata_shard['zero_crossing_rate'] = audio_metadata_shard['zero_crossing_rate'].apply(lambda x: json.dumps(x.tolist()))
audio_metadata_shard['chroma_stft'] = audio_metadata_shard['chroma_stft'].apply(lambda x: json.dumps(x.tolist()))
audio_metadata_shard['mfcc'] = audio_metadata_shard['mfcc'].apply(lambda x: json.dumps(x.tolist()))
audio_metadata_shard['rmse'] = audio_metadata_shard['rmse'].apply(lambda x: json.dumps(x.tolist()))
audio_metadata_shard['onset_env'] = audio_metadata_shard['onset_env'].apply(lambda x: json.dumps(x.tolist()))
audio_metadata_shard['y'] = audio_metadata_shard['y'].apply(lambda x: json.dumps(x.tolist()))

In [6]:
# Set pandas option to prevent truncation when writing to csv for processing the list of lists in the dataframe with ast.literal_eval
#pd.set_option('display.max_seq_items', None)
audio_metadata_shard.to_csv('Actors_1-5_Metadata.csv', index=False)
#pd.reset_option('display.max_seq_items')

In [7]:
audio_metadata_shard.shape

(300, 18)

In [11]:
print("After processing of the audio data into a metadata CSV dataset for the Actors 1-5, the size of the raw audio dataset/directory reduced from 125 MB to only 5 MB, which is a reduction of about {}%".format(1 - 5/125))

After processing of the audio data into a metadata CSV dataset for the Actors 1-5, the size of the raw audio dataset/directory reduced from 125 MB to only 5 MB, which is a reduction of about 0.96%


In [8]:
# testing that the shards are converted to the correct format (json string)
audio_metadata_shard = pd.read_csv('Actors_1-5_Metadata.csv')
# convert the json stringa back to a list of lists for the list of list features using json.loads
audio_metadata_shard['spectral_centroid'] = audio_metadata_shard['spectral_centroid'].apply(lambda x: json.loads(x))
audio_metadata_shard['spectral_bandwidth'] = audio_metadata_shard['spectral_bandwidth'].apply(lambda x: json.loads(x))
audio_metadata_shard['spectral_rolloff'] = audio_metadata_shard['spectral_rolloff'].apply(lambda x: json.loads(x))
audio_metadata_shard['zero_crossing_rate'] = audio_metadata_shard['zero_crossing_rate'].apply(lambda x: json.loads(x))
audio_metadata_shard['chroma_stft'] = audio_metadata_shard['chroma_stft'].apply(lambda x: json.loads(x))
audio_metadata_shard['mfcc'] = audio_metadata_shard['mfcc'].apply(lambda x: json.loads(x))
audio_metadata_shard['rmse'] = audio_metadata_shard['rmse'].apply(lambda x: json.loads(x))
audio_metadata_shard['onset_env'] = audio_metadata_shard['onset_env'].apply(lambda x: json.loads(x))
audio_metadata_shard['y'] = audio_metadata_shard['y'].apply(lambda x: json.loads(x))

In [10]:
audio_metadata_shard['y'][0][:5]

[9.800436600926332e-06,
 7.78237335907761e-06,
 -2.2437436655309284e-06,
 9.221363939104776e-07,
 -2.9372728249654756e-07]

# Conclusion
Given that the function was able to process the data for the Actors 1-5 zip successfully, we can now go on to write a complete script to process the entire raw data directory corpus (for Actors 1-24, excluding the additional emotiona_speech/audio_speech_actors_01-24 directory - this one may contain different features)

In [None]:
#run this after running python sample.py
meta_df = pd.read_csv('actors_meta_df.csv')

In [None]:
meta_df.shape